Compare commits

...

22 Commits

Author SHA1 Message Date
56c1e059f9 Don't build tests by default, fix enlarge button 2020-12-31 10:43:03 -05:00
f87eac1f90 Update submodules 2020-12-31 10:26:05 -05:00
ddafbab6a6 Update readme 2020-12-31 10:26:05 -05:00
b91d574756 Add md5 client-side lib 2020-12-31 10:26:05 -05:00
576140e542 fix submodules 2020-12-31 10:26:05 -05:00
050c1283a3 Remove UUID dep, fix incremental scan, use MD5(path) as unique id, version bump 2020-12-31 10:26:05 -05:00
c6e1ba03bc Better support for .doc files 2020-12-31 10:26:05 -05:00
10e32f707f Update README.md 2020-12-31 10:26:05 -05:00
86e83bafaf Update README.md 2020-12-31 10:26:05 -05:00
51a40c8819 Add .doc support 2020-12-31 10:26:05 -05:00
acc557
36281a5108 Use relative path for loading csv in stats 2020-12-31 10:26:05 -05:00
acc557
76a0bda48b Update search.html
Fix relative stats URL
2020-12-31 10:26:05 -05:00
0cf29a660c Fix relative image URL #122 2020-12-31 10:26:05 -05:00
6cd0741848 update build instructions 2020-12-31 10:26:05 -05:00
bc120f349d Setup ARM CI builds 2020-12-23 10:26:26 -05:00
8cac8c98d7 Update dev builds template 2020-12-22 14:45:16 -05:00
30921ac52e Setup drone ci 2020-12-22 14:09:45 -05:00
95bbe39afc Update libmupdf 2020-10-25 09:44:30 -04:00
72ce217f9c Optionally ES schema from file #117 2020-10-25 09:44:30 -04:00
641a8ec90c sidecar files #114, version bump 2020-10-25 09:44:30 -04:00
7a505c2287 Fix typo 2020-10-25 09:44:30 -04:00
12f162d760 Fix #110 2020-10-25 09:44:30 -04:00
44 changed files with 988 additions and 625 deletions

56
.drone.yml Normal file
View File

@ -0,0 +1,56 @@
kind: pipeline
type: docker
name: amd64
platform:
os: linux
arch: amd64
steps:
- name: build
image: simon987/ubuntu_ci
commands:
- ./ci/build.sh
- name: scp files
image: appleboy/drone-scp
settings:
host:
from_secret: SSH_HOST
port:
from_secret: SSH_PORT
user:
from_secret: SSH_USER
key:
from_secret: SSH_KEY
target: /files/sist2/${DRONE_REPO_OWNER}_${DRONE_REPO_NAME}/${DRONE_BRANCH}_${DRONE_BUILD_NUMBER}_${DRONE_COMMIT}/
source:
- ./sist2.gz
- ./sist2_debug.tar.gz
---
kind: pipeline
type: docker
name: arm64
platform:
arch: arm64
steps:
- name: build
image: simon987/ubuntu_ci_arm
commands:
- ./ci/build_arm64.sh
- name: scp files
image: appleboy/drone-scp
settings:
host:
from_secret: SSH_HOST
port:
from_secret: SSH_PORT
user:
from_secret: SSH_USER
key:
from_secret: SSH_KEY
target: /files/sist2/${DRONE_REPO_OWNER}_${DRONE_REPO_NAME}/${DRONE_BRANCH}_${DRONE_BUILD_NUMBER}_${DRONE_COMMIT}/
source:
- ./sist2_arm64.gz

1
.gitignore vendored
View File

@ -1,6 +1,5 @@
.idea
thumbs
test
*.cbp
CMakeCache.txt
CMakeFiles

View File

@ -1,69 +0,0 @@
import jetbrains.buildServer.configs.kotlin.v2019_2.*
import jetbrains.buildServer.configs.kotlin.v2019_2.buildSteps.ExecBuildStep
import jetbrains.buildServer.configs.kotlin.v2019_2.buildSteps.exec
import jetbrains.buildServer.configs.kotlin.v2019_2.triggers.vcs
import jetbrains.buildServer.configs.kotlin.v2019_2.vcs.GitVcsRoot
/*
The settings script is an entry point for defining a TeamCity
project hierarchy. The script should contain a single call to the
project() function with a Project instance or an init function as
an argument.
VcsRoots, BuildTypes, Templates, and subprojects can be
registered inside the project using the vcsRoot(), buildType(),
template(), and subProject() methods respectively.
To debug settings scripts in command-line, run the
mvnDebug org.jetbrains.teamcity:teamcity-configs-maven-plugin:generate
command and attach your debugger to the port 8000.
To debug in IntelliJ Idea, open the 'Maven Projects' tool window (View
-> Tool Windows -> Maven Projects), find the generate task node
(Plugins -> teamcity-configs -> teamcity-configs:generate), the
'Debug' option is available in the context menu for the task.
*/
version = "2019.2"
project {
vcsRoot(HttpsGithubComSimon987sist2refsHeadsMaster)
buildType(Build)
}
object Build : BuildType({
name = "Build"
artifactRules = """
sist2
sist2_scan
""".trimIndent()
vcs {
root(HttpsGithubComSimon987sist2refsHeadsMaster)
}
steps {
exec {
name = "Build"
path = "./ci/build.sh"
dockerImage = "simon987/general_ci"
dockerImagePlatform = ExecBuildStep.ImagePlatform.Linux
dockerPull = true
}
}
triggers {
vcs {
}
}
})
object HttpsGithubComSimon987sist2refsHeadsMaster : GitVcsRoot({
name = "https://github.com/simon987/sist2#refs/heads/master"
url = "https://github.com/simon987/sist2"
})

View File

@ -5,12 +5,16 @@ project(sist2 C)
option(SIST_DEBUG "Build a debug executable" on)
set(BUILD_TESTS off)
add_subdirectory(third-party/libscan)
set(ARGPARSE_SHARED off)
add_subdirectory(third-party/argparse)
add_executable(
sist2
add_executable(sist2
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/main.c
src/sist.h
src/io/walk.h src/io/walk.c
@ -25,12 +29,9 @@ add_executable(
src/util.c src/util.h
src/ctx.h src/types.h
src/log.c src/log.h
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/cli.c src/cli.h
src/stats.c src/stats.h src/ctx.c)
src/stats.c src/stats.h src/ctx.c
src/parsing/sidecar.c src/parsing/sidecar.h)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
@ -39,7 +40,6 @@ find_package(lmdb CONFIG REQUIRED)
find_package(cJSON CONFIG REQUIRED)
find_package(unofficial-glib CONFIG REQUIRED)
find_package(unofficial-mongoose CONFIG REQUIRED)
find_library(UUID_LIB NAMES uuid)
find_package(CURL CONFIG REQUIRED)
#find_package(OpenSSL REQUIRED)
@ -67,13 +67,13 @@ if (SIST_DEBUG)
-fstack-protector
-fno-omit-frame-pointer
-fsanitize=address
-O2
-fno-inline
# -O2
)
target_link_options(
sist2
PRIVATE
-fsanitize=address
# -static
)
set_target_properties(
sist2
@ -81,7 +81,6 @@ if (SIST_DEBUG)
OUTPUT_NAME sist2_debug
)
else ()
# set(VCPKG_BUILD_TYPE release)
target_compile_options(
sist2
PRIVATE
@ -106,13 +105,13 @@ target_link_libraries(
argparse
unofficial::glib::glib
unofficial::mongoose::mongoose
# OpenSSL::SSL OpenSSL::Crypto
CURL::libcurl
${UUID_LIB}
pthread
magic
c
scan
)

View File

@ -1,8 +1,8 @@
![GitHub](https://img.shields.io/github/license/simon987/sist2.svg)
[![CodeFactor](https://www.codefactor.io/repository/github/simon987/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/simon987/sist2)
[![Development snapshots](https://ci.simon987.net/app/rest/builds/buildType(Sist2_Build)/statusIcon)](https://files.simon987.net/artifacts/Sist2/Build/)
[![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/sist2/simon987_sist2/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/?i=Demo%20files)
# sist2
@ -52,7 +52,7 @@ sist2 (Simple incremental search tool)
```
1. Download sist2 executable
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/artifacts/Sist2/Build/) *(Not recommended!)*
1. *(or)* Download a [development snapshot](https://files.simon987.net/sist2/simon987_sist2/) *(Not recommended!)*
1. *(or)* `docker pull simon987/sist2:latest`
1. See [Usage guide](docs/USAGE.md)
@ -74,7 +74,7 @@ See [Usage guide](docs/USAGE.md) for more details
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | title |
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | *(none)* | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist |
@ -85,6 +85,7 @@ ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
html, xml | *(none)* | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | *(none)* | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
\* *See [Archive files](#archive-files)*
@ -126,12 +127,12 @@ binaries (GCC 7+ required).
1. Install compile-time dependencies
```bash
vcpkg install lmdb cjson glib libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 ffmpeg zstd gtest mongoose libuuid libmagic libraw curl[core,ssl] jbig2dec brotli libmupdf
vcpkg install lmdb cjson glib libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 ffmpeg zstd gtest mongoose libmagic libraw curl[core,ssl] jbig2dec brotli libmupdf
```
2. Build
```bash
git clone --recursive https://github.com/simon987/sist2/
cmake -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .
cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .
make
```

View File

@ -4,14 +4,16 @@ VCPKG_ROOT="/vcpkg"
rm *.gz
git submodule update --init --recursive
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_DEBUG=off -DVCPKG_BUILD_TYPE=release -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j 12
cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j 33
strip sist2
gzip -9 sist2
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_DEBUG=on -DVCPKG_BUILD_TYPE=debug -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j 12
cmake -DSIST_DEBUG=on -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j 33
cp /usr/lib/x86_64-linux-gnu/libasan.so.2.0.0 libasan.so.2
tar -czf sist2_debug.tar.gz sist2_debug libasan.so.2

View File

@ -4,6 +4,8 @@ VCPKG_ROOT="/vcpkg"
rm *.gz
git submodule update --init --recursive
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j 4

View File

@ -16,6 +16,7 @@
* [link to specific indices](#link-to-specific-indices)
* [exec-script](#exec-script)
* [tagging](#tagging)
* [sidecar files](#sidecar-files)
```
Usage: sist2 scan [OPTION]... PATH
@ -52,6 +53,8 @@ Index options
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
-p, --print Just print JSON documents to stdout.
--script-file=<str> Path to user script.
--mappings-file=<str> Path to Elasticsearch mappings.
--settings-file=<str> Path to Elasticsearch settings.
--async-script Execute user script asynchronously.
--batch-size=<int> Index batch size. DEFAULT: 100
-f, --force-reset Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
@ -153,10 +156,13 @@ documents.idx/
├── agg_mime.csv
├── agg_date.csv
├── add_size.csv
├── thumbs
├── thumbs/
| ├── data.mdb
| └── lock.mdb
└── tags
├── tags/
| ├── data.mdb
| └── lock.mdb
└── meta/
├── data.mdb
└── lock.mdb
```
@ -183,9 +189,11 @@ by a third party application. The 'external' index must have the following forma
my_index/
├── descriptor.json
├── _index_0
└── thumbs
├── data.mdb
└── lock.mdb
└── thumbs/
| ├── data.mdb
| └── lock.mdb
└── meta/
└── <empty>
```
*descriptor.json*:
@ -233,9 +241,11 @@ The `_text.*` items will be indexed and searchable as **text** fields (fuzzy sea
*thumbs/*:
LMDB key-value store. Keys are **binary** 128-bit UUID4s (`_id` field)
LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field)
and values are raw image bytes.
*\* Hash is calculated from the full path of the file, including the extension, relative to the index root*
Importing an external `binary` type index is technically possible but
it is currently unsupported and has no guaranties of back/forward compatibility.
@ -251,6 +261,10 @@ it is currently unsupported and has no guaranties of back/forward compatibility.
Print index in JSON format to stdout.
* `--script-file`
Path to user script. See [Scripting](scripting.md).
* `--mappings-file`
Path to custom Elasticsearch mappings. If none is specified, [the bundled mappings](https://github.com/simon987/sist2/tree/master/schema) will be used.
* `--settings-file`
Path to custom Elasticsearch settings. *(See above)*
* `--async-script`
Use `wait_for_completion=false` elasticsearch option while executing user script.
(See [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html))
@ -260,7 +274,6 @@ it is currently unsupported and has no guaranties of back/forward compatibility.
down the process.
* `-f, --force-reset`
Reset Elasticsearch mappings and settings.
**(You must use this option the first time you use the index command)**.
### Index examples
@ -343,9 +356,48 @@ See [Automatic tagging](#automatic-tagging) for information about tag
hierarchies and tag colors.
\* *It can take a few seconds to take effect in new search queries, and the page needs
to be reloaded for the tag tab to update*
to be reloaded for the tags tab to update*
### Automatic tagging
See [scripting](scripting.md) documentation.
# Sidecar files
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
original document's metadata. Sidecar metadata files will also work inside archives.
Sidecar files themselves are not saved in the index.
This feature is useful to leverage third-party applications such as speech-to-text or
OCR to add additional metadata to a file.
**Example**
```
~/Documents/
├── Video.mp4
└── Video.mp4.s2meta
```
The sidecar file must have exactly the same file path and the `.s2meta` suffix.
`Video.mp4.s2meta`:
```json
{
"content": "This sidecar file will overwrite some metadata fields of Video.mp4",
"author": "Some author",
"duration": 12345,
"bitrate": 67890,
"some_arbitrary_field": [1,2,3]
}
```
```
sist2 scan ~/Documents -o ./docs.idx
sist2 index ./docs.idx
```
*NOTE*: It is technically possible to overwrite the `tag` value using sidecar files, however,
it is not currently possible to restore both manual tags and sidecar tags without user scripts
while reindexing.

View File

@ -30,6 +30,10 @@
"mime": {
"type": "keyword"
},
"parent": {
"type": "keyword",
"index": false
},
"thumbnail": {
"type": "keyword",
"index": false

View File

@ -449,3 +449,4 @@ image/x-sony-arw, arw
image/x-sony-sr2, sr2
image/x-sony-srf, srf
image/x-epson-erf, erf
sist2/sidecar, s2meta
1 application/arj arj
449 image/x-sony-sr2 sr2
450 image/x-sony-srf srf
451 image/x-epson-erf erf
452 sist2/sidecar s2meta

View File

@ -3,6 +3,7 @@ noparse = set()
ext_in_hash = set()
major_mime = {
"sist2": 0,
"model": 1,
"example": 2,
"message": 3,
@ -122,7 +123,11 @@ def mime_id(mime):
elif mime in raw:
mime_id += " | 0x00800000"
elif mime == "application/x-empty":
cnt -= 1
return "1"
elif mime == "sist2/sidecar":
cnt -= 1
return "2"
return mime_id

View File

@ -56,6 +56,12 @@ void scan_args_destroy(scan_args_t *args) {
void index_args_destroy(index_args_t *args) {
//todo
if (args->es_mappings_path) {
free(args->es_mappings);
}
if (args->es_settings_path) {
free(args->es_settings);
}
free(args);
}
@ -231,25 +237,25 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
return 0;
}
int load_script(const char *script_path, char **dst) {
int load_external_file(const char *file_path, char **dst) {
struct stat info;
int res = stat(script_path, &info);
int res = stat(file_path, &info);
if (res == -1) {
fprintf(stderr, "Error opening script file '%s': %s\n", script_path, strerror(errno));
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
return 1;
}
int fd = open(script_path, O_RDONLY);
int fd = open(file_path, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "Error opening script file '%s': %s\n", script_path, strerror(errno));
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
return 1;
}
*dst = malloc(info.st_size + 1);
res = read(fd, *dst, info.st_size);
if (res < 0) {
fprintf(stderr, "Error reading script file '%s': %s\n", script_path, strerror(errno));
LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno))
return 1;
}
@ -293,7 +299,19 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
}
if (args->script_path != NULL) {
if (load_script(args->script_path, &args->script) != 0) {
if (load_external_file(args->script_path, &args->script) != 0) {
return 1;
}
}
if (args->es_settings_path != NULL) {
if (load_external_file(args->es_settings_path, &args->es_settings) != 0) {
return 1;
}
}
if (args->es_mappings_path != NULL) {
if (load_external_file(args->es_mappings_path, &args->es_mappings) != 0) {
return 1;
}
}
@ -309,6 +327,10 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg async_script=%s", args->async_script)
LOG_DEBUGF("cli.c", "arg script=%s", args->script)
LOG_DEBUGF("cli.c", "arg print=%d", args->print)
LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path)
LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings)
LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path)
LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings)
LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size)
LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset)
@ -445,7 +467,7 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
LOG_FATAL("cli.c", "--script-file argument is required");
}
if (load_script(args->script_path, &args->script) != 0) {
if (load_external_file(args->script_path, &args->script) != 0) {
return 1;
}

View File

@ -39,6 +39,10 @@ typedef struct index_args {
const char *index_path;
const char *script_path;
char *script;
const char *es_settings_path;
char *es_settings;
const char *es_mappings_path;
char *es_mappings;
int print;
int batch_size;
int async_script;

View File

@ -13,6 +13,7 @@
#include "libscan/text/text.h"
#include "libscan/mobi/scan_mobi.h"
#include "libscan/raw/raw.h"
#include "libscan/msdoc/msdoc.h"
#include "src/io/store.h"
#include <glib.h>
@ -48,6 +49,7 @@ typedef struct {
scan_text_ctx_t text_ctx;
scan_mobi_ctx_t mobi_ctx;
scan_raw_ctx_t raw_ctx;
scan_msdoc_ctx_t msdoc_ctx;
} ScanCtx_t;
typedef struct {
@ -63,6 +65,8 @@ typedef struct {
tpool_t *pool;
store_t *tag_store;
GHashTable *tags;
store_t *meta_store;
GHashTable *meta;
} IndexCtx_t;
typedef struct {

View File

@ -30,11 +30,11 @@ void elastic_cleanup() {
}
}
void print_json(cJSON *document, const char uuid_str[UUID_STR_LEN]) {
void print_json(cJSON *document, const char id_str[MD5_STR_LENGTH]) {
cJSON *line = cJSON_CreateObject();
cJSON_AddStringToObject(line, "_id", uuid_str);
cJSON_AddStringToObject(line, "_id", id_str);
cJSON_AddStringToObject(line, "_index", IndexCtx.es_index);
cJSON_AddStringToObject(line, "_type", "_doc");
cJSON_AddItemReferenceToObject(line, "_source", document);
@ -52,13 +52,13 @@ void index_json_func(void *arg) {
elastic_index_line(line);
}
void index_json(cJSON *document, const char uuid_str[UUID_STR_LEN]) {
void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]) {
char *json = cJSON_PrintUnformatted(document);
size_t json_len = strlen(json);
es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t) + json_len + 2);
memcpy(bulk_line->line, json, json_len);
memcpy(bulk_line->uuid_str, uuid_str, UUID_STR_LEN);
memcpy(bulk_line->path_md5_str, index_id_str, MD5_STR_LENGTH);
*(bulk_line->line + json_len) = '\n';
*(bulk_line->line + json_len + 1) = '\0';
bulk_line->next = NULL;
@ -67,7 +67,7 @@ void index_json(cJSON *document, const char uuid_str[UUID_STR_LEN]) {
tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
}
void execute_update_script(const char *script, int async, const char index_id[UUID_STR_LEN]) {
void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]) {
if (Indexer == NULL) {
Indexer = create_indexer(IndexCtx.es_url, IndexCtx.es_index);
@ -129,9 +129,9 @@ void *create_bulk_buffer(int max, int *count, size_t *buf_len) {
while (line != NULL && *count < max) {
char action_str[256];
snprintf(
action_str, 256,
action_str, sizeof(action_str),
"{\"index\":{\"_id\":\"%s\",\"_type\":\"_doc\",\"_index\":\"%s\"}}\n",
line->uuid_str, Indexer->es_index
line->path_md5_str, Indexer->es_index
);
size_t action_str_len = strlen(action_str);
@ -220,7 +220,7 @@ void _elastic_flush(int max) {
if (r->status_code == 413) {
if (max <= 1) {
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->uuid_str)
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->path_md5_str)
free_response(r);
free(buf);
delete_queue(1);
@ -356,7 +356,7 @@ void finish_indexer(char *script, int async_script, char *index_id) {
free_response(r);
}
void elastic_init(int force_reset) {
void elastic_init(int force_reset, const char* user_mappings, const char* user_settings) {
// Check if index exists
char url[4096];
@ -392,13 +392,13 @@ void elastic_init(int force_reset) {
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_settings", IndexCtx.es_url, IndexCtx.es_index);
r = web_put(url, settings_json);
LOG_INFOF("elastic.c", "Update settings <%d>", r->status_code);
r = web_put(url, user_settings ? user_settings : settings_json);
LOG_INFOF("elastic.c", "Update user_settings <%d>", r->status_code);
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_mappings/_doc?include_type_name=true", IndexCtx.es_url, IndexCtx.es_index);
r = web_put(url, mappings_json);
LOG_INFOF("elastic.c", "Update mappings <%d>", r->status_code);
r = web_put(url, user_mappings ? user_mappings : mappings_json);
LOG_INFOF("elastic.c", "Update user_mappings <%d>", r->status_code);
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_open", IndexCtx.es_url, IndexCtx.es_index);
@ -408,9 +408,9 @@ void elastic_init(int force_reset) {
}
}
cJSON *elastic_get_document(const char *uuid_str) {
cJSON *elastic_get_document(const char *id_str) {
char url[4096];
snprintf(url, sizeof(url), "%s/%s/_doc/%s", WebCtx.es_url, WebCtx.es_index, uuid_str);
snprintf(url, sizeof(url), "%s/%s/_doc/%s", WebCtx.es_url, WebCtx.es_index, id_str);
response_t *r = web_get(url, 3);
cJSON *json = NULL;

View File

@ -5,7 +5,7 @@
typedef struct es_bulk_line {
struct es_bulk_line *next;
char uuid_str[UUID_STR_LEN];
char path_md5_str[MD5_STR_LENGTH];
char line[0];
} es_bulk_line_t;
@ -16,21 +16,21 @@ typedef struct es_indexer es_indexer_t;
void elastic_index_line(es_bulk_line_t *line);
void print_json(cJSON *document, const char uuid_str[UUID_STR_LEN]);
void print_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]);
void index_json(cJSON *document, const char uuid_str[UUID_STR_LEN]);
void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]);
es_indexer_t *create_indexer(const char *url, const char *index);
void elastic_cleanup();
void finish_indexer(char *script, int async_script, char *index_id);
void elastic_init(int force_reset);
void elastic_init(int force_reset, const char* user_mappings, const char* user_settings);
cJSON *elastic_get_document(const char *uuid_str);
cJSON *elastic_get_document(const char *id_str);
char *elastic_get_status();
void execute_update_script(const char *script, int async, const char index_id[UUID_STR_LEN]);
void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
#endif

File diff suppressed because one or more lines are too long

View File

@ -6,13 +6,13 @@
static __thread int index_fd = -1;
typedef struct {
unsigned char uuid[16];
unsigned long ino;
unsigned char path_md5[MD5_DIGEST_LENGTH];
unsigned long size;
unsigned int mime;
int mtime;
short base;
short ext;
char has_parent;
} line_t;
void skip_meta(FILE *file) {
@ -32,7 +32,7 @@ void skip_meta(FILE *file) {
void write_index_descriptor(char *path, index_descriptor_t *desc) {
cJSON *json = cJSON_CreateObject();
cJSON_AddStringToObject(json, "uuid", desc->uuid);
cJSON_AddStringToObject(json, "id", desc->id);
cJSON_AddStringToObject(json, "version", desc->version);
cJSON_AddStringToObject(json, "root", desc->root);
cJSON_AddStringToObject(json, "name", desc->name);
@ -82,7 +82,7 @@ index_descriptor_t read_index_descriptor(char *path) {
strcpy(descriptor.rewrite_url, cJSON_GetObjectItem(json, "rewrite_url")->valuestring);
descriptor.root_len = (short) strlen(descriptor.root);
strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
strcpy(descriptor.uuid, cJSON_GetObjectItem(json, "uuid")->valuestring);
strcpy(descriptor.id, cJSON_GetObjectItem(json, "id")->valuestring);
if (cJSON_GetObjectItem(json, "type") == NULL) {
strcpy(descriptor.type, INDEX_TYPE_BIN);
} else {
@ -219,7 +219,7 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
dyn_buffer_t buf = dyn_buffer_create();
FILE *file = fopen(path, "rb");
while (1) {
while (TRUE) {
buf.cur = 0;
size_t _ = fread((void *) &line, 1, sizeof(line_t), file);
if (feof(file)) {
@ -229,8 +229,8 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
cJSON *document = cJSON_CreateObject();
cJSON_AddStringToObject(document, "index", index_id);
char uuid_str[UUID_STR_LEN];
uuid_unparse(line.uuid, uuid_str);
char path_md5_str[MD5_STR_LENGTH];
buf2hex(line.path_md5, sizeof(line.path_md5), path_md5_str);
const char *mime_text = mime_get_mime_text(line.mime);
if (mime_text == NULL) {
@ -247,14 +247,6 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
}
dyn_buffer_write_char(&buf, '\0');
if (IndexCtx.tags != NULL) {
const char *tags_string = g_hash_table_lookup(IndexCtx.tags, buf.buf);
if (tags_string != NULL) {
cJSON *tags_arr = cJSON_Parse(tags_string);
cJSON_AddItemToObject(document, "tag", tags_arr);
}
}
cJSON_AddStringToObject(document, "extension", buf.buf + line.ext);
if (*(buf.buf + line.ext - 1) == '.') {
*(buf.buf + line.ext - 1) = '\0';
@ -334,8 +326,36 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
key = getc(file);
}
func(document, uuid_str);
cJSON *meta_obj = NULL;
if (IndexCtx.meta != NULL) {
const char *meta_string = g_hash_table_lookup(IndexCtx.meta, path_md5_str);
if (meta_string != NULL) {
meta_obj = cJSON_Parse(meta_string);
cJSON *child;
for (child = meta_obj->child; child != NULL; child = child->next) {
char meta_key[4096];
strcpy(meta_key, child->string);
cJSON_DeleteItemFromObject(document, meta_key);
cJSON_AddItemReferenceToObject(document, meta_key, child);
}
}
}
if (IndexCtx.tags != NULL) {
const char *tags_string = g_hash_table_lookup(IndexCtx.tags, path_md5_str);
if (tags_string != NULL) {
cJSON *tags_arr = cJSON_Parse(tags_string);
cJSON_DeleteItemFromObject(document, "tag");
cJSON_AddItemToObject(document, "tag", tags_arr);
}
}
func(document, path_md5_str);
cJSON_Delete(document);
if (meta_obj) {
cJSON_Delete(meta_obj);
}
}
dyn_buffer_destroy(&buf);
fclose(file);
@ -359,7 +379,7 @@ const char *json_type_array_fields[] = {
void read_index_json(const char *path, UNUSED(const char *index_id), index_func func) {
FILE *file = fopen(path, "r");
while (1) {
while (TRUE) {
char *line = NULL;
size_t len;
size_t read = getline(&line, &len, file);
@ -379,7 +399,7 @@ void read_index_json(const char *path, UNUSED(const char *index_id), index_func
}
cJSON *document = cJSON_CreateObject();
const char *uuid_str = cJSON_GetObjectItem(input, "_id")->valuestring;
const char *id_str = cJSON_GetObjectItem(input, "_id")->valuestring;
for (int i = 0; i < (sizeof(json_type_copy_fields) / sizeof(json_type_copy_fields[0])); i++) {
cJSON *value = cJSON_GetObjectItem(input, json_type_copy_fields[i]);
@ -407,7 +427,7 @@ void read_index_json(const char *path, UNUSED(const char *index_id), index_func
}
}
func(document, uuid_str);
func(document, id_str);
cJSON_Delete(document);
cJSON_Delete(input);
@ -415,7 +435,7 @@ void read_index_json(const char *path, UNUSED(const char *index_id), index_func
fclose(file);
}
void read_index(const char *path, const char index_id[UUID_STR_LEN], const char *type, index_func func) {
void read_index(const char *path, const char index_id[MD5_STR_LENGTH], const char *type, index_func func) {
if (strcmp(type, INDEX_TYPE_BIN) == 0) {
read_index_bin(path, index_id, func);
@ -428,13 +448,15 @@ void incremental_read(GHashTable *table, const char *filepath) {
FILE *file = fopen(filepath, "rb");
line_t line;
LOG_DEBUGF("serialize.c", "Incremental read %s", filepath)
while (1) {
size_t ret = fread((void *) &line, 1, sizeof(line_t), file);
size_t ret = fread((void *) &line, sizeof(line_t), 1, file);
if (ret != 1 || feof(file)) {
break;
}
incremental_put(table, line.ino, line.mtime);
incremental_put(table, line.path_md5, line.mtime);
while ((getc(file))) {}
skip_meta(file);
@ -452,33 +474,47 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
FILE *dst_file = fopen(dst_filepath, "ab");
line_t line;
while (1) {
size_t ret = fread((void *) &line, 1, sizeof(line_t), file);
LOG_DEBUGF("serialize.c", "Incremental copy %s", filepath)
while (TRUE) {
size_t ret = fread((void *) &line, sizeof(line_t), 1, file);
if (ret != 1 || feof(file)) {
break;
}
if (incremental_get(copy_table, line.ino)) {
// Assume that files with parents still exist.
// One way to "fix" this would be to check if the parent is marked for copy but it would consistently
// delete files with grandparents, which is a side-effect worse than having orphaned files
if (line.has_parent || incremental_get(copy_table, line.path_md5)) {
fwrite(&line, sizeof(line), 1, dst_file);
size_t buf_len;
char *buf = store_read(store, (char *) line.uuid, 16, &buf_len);
store_write(dst_store, (char *) line.uuid, 16, buf, buf_len);
free(buf);
// Copy filepath
char filepath_buf[PATH_MAX];
char c;
char *ptr = filepath_buf;
while ((c = (char) getc(file))) {
fwrite(&c, sizeof(c), 1, dst_file);
*ptr++ = c;
}
*ptr = '\0';
fwrite(filepath_buf, (ptr - filepath_buf) + 1, 1, dst_file);
// Copy tn store contents
size_t buf_len;
char path_md5[MD5_DIGEST_LENGTH];
MD5((unsigned char *) filepath_buf, (ptr - filepath_buf), (unsigned char *) path_md5);
char *buf = store_read(store, path_md5, sizeof(path_md5), &buf_len);
if (buf_len != 0) {
store_write(dst_store, path_md5, sizeof(path_md5), buf, buf_len);
free(buf);
}
fwrite("\0", sizeof(c), 1, dst_file);
enum metakey key;
while (1) {
key = getc(file);
fwrite(&key, sizeof(char), 1, dst_file);
if (key == '\n') {
break;
}
fwrite(&key, sizeof(char), 1, dst_file);
if (IS_META_INT(key)) {
int val;
@ -494,14 +530,12 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
}
fwrite("\0", sizeof(c), 1, dst_file);
}
if (ret != 1) {
break;
}
}
} else {
while ((getc(file))) {}
skip_meta(file);
}
}
fclose(file);
fclose(dst_file);
}

View File

@ -7,14 +7,14 @@
#include <sys/syscall.h>
#include <glib.h>
typedef void(*index_func)(cJSON *, const char[UUID_STR_LEN]);
typedef void(*index_func)(cJSON *, const char[MD5_STR_LENGTH]);
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
const char *dst_filepath, GHashTable *copy_table);
void write_document(document_t *doc);
void read_index(const char *path, const char[UUID_STR_LEN], const char *type, index_func);
void read_index(const char *path, const char[MD5_STR_LENGTH], const char *type, index_func);
void incremental_read(GHashTable *table, const char *filepath);

View File

@ -40,12 +40,20 @@ void store_destroy(store_t *store) {
free(store);
}
void store_flush(store_t *store) {
mdb_env_sync(store->env, TRUE);
}
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
if (LogCtx.very_verbose) {
char uuid_str[UUID_STR_LEN];
uuid_unparse((unsigned char *) key, uuid_str);
LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", uuid_str, buf_len)
if (key_len == MD5_DIGEST_LENGTH) {
char path_md5_str[MD5_STR_LENGTH];
buf2hex((unsigned char *) key, MD5_DIGEST_LENGTH, path_md5_str);
LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", path_md5_str, buf_len)
} else {
LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", key, buf_len)
}
}
MDB_val mdb_key;
@ -136,7 +144,9 @@ GHashTable *store_read_all(store_t *store) {
count += 1;
}
LOG_DEBUGF("store.c", "Read tags for %d documents", count);
const char *path;
mdb_env_get_path(store->env, &path);
LOG_DEBUGF("store.c", "Read %d entries from %s", count, path);
mdb_cursor_close(cur);
mdb_txn_abort(txn);

View File

@ -8,6 +8,7 @@
#define STORE_SIZE_TN 1024 * 1024 * 5
#define STORE_SIZE_TAG 1024 * 16
#define STORE_SIZE_META STORE_SIZE_TAG
typedef struct store_t {
MDB_dbi dbi;
@ -23,6 +24,8 @@ void store_destroy(store_t *store);
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len);
void store_flush(store_t *store);
char *store_read(store_t *store, char *key, size_t key_len, size_t *ret_vallen);
GHashTable *store_read_all(store_t *store);

View File

@ -20,7 +20,7 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,
job->vfile.info = *info;
memset(job->parent, 0, 16);
memset(job->parent, 0, MD5_DIGEST_LENGTH);
job->vfile.filepath = job->filepath;
job->vfile.read = fs_read;

View File

@ -21,7 +21,7 @@
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
static const char *const Version = "2.8.4";
static const char *const Version = "2.9.0";
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",
@ -34,9 +34,10 @@ void init_dir(const char *dirpath) {
char path[PATH_MAX];
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
uuid_t uuid;
uuid_generate(uuid);
uuid_unparse(uuid, ScanCtx.index.desc.uuid);
unsigned char index_md5[MD5_DIGEST_LENGTH];
MD5((unsigned char *) ScanCtx.index.desc.name, strlen(ScanCtx.index.desc.name), index_md5);
buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
time(&ScanCtx.index.desc.timestamp);
strcpy(ScanCtx.index.desc.version, Version);
strcpy(ScanCtx.index.desc.type, INDEX_TYPE_BIN);
@ -149,6 +150,14 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.text_ctx.log = _log;
ScanCtx.text_ctx.logf = _logf;
// MSDOC
ScanCtx.msdoc_ctx.tn_size = args->size;
ScanCtx.msdoc_ctx.content_size = args->content_size;
ScanCtx.msdoc_ctx.log = _log;
ScanCtx.msdoc_ctx.logf = _logf;
ScanCtx.msdoc_ctx.store = _store;
ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/msword");
ScanCtx.threads = args->threads;
ScanCtx.depth = args->depth;
@ -182,6 +191,10 @@ void sist2_scan(scan_args_t *args) {
mkdir(store_path, S_IWUSR | S_IRUSR | S_IXUSR);
ScanCtx.index.store = store_create(store_path, STORE_SIZE_TN);
snprintf(store_path, PATH_MAX, "%smeta", ScanCtx.index.path);
mkdir(store_path, S_IWUSR | S_IRUSR | S_IXUSR);
ScanCtx.index.meta_store = store_create(store_path, STORE_SIZE_META);
scan_print_header();
if (args->incremental != NULL) {
@ -206,7 +219,7 @@ void sist2_scan(scan_args_t *args) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", args->incremental, de->d_name);
snprintf(file_path, PATH_MAX, "%s%s", args->incremental, de->d_name);
incremental_read(ScanCtx.original_table, file_path);
}
}
@ -221,8 +234,6 @@ void sist2_scan(scan_args_t *args) {
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
generate_stats(&ScanCtx.index, args->treemap_threshold, ScanCtx.index.path);
if (args->incremental != NULL) {
char dst_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", args->incremental);
@ -238,7 +249,7 @@ void sist2_scan(scan_args_t *args) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", args->incremental, de->d_name);
snprintf(file_path, PATH_MAX, "%s%s", args->incremental, de->d_name);
incremental_copy(source, ScanCtx.index.store, file_path, dst_path, ScanCtx.copy_table);
}
}
@ -253,6 +264,8 @@ void sist2_scan(scan_args_t *args) {
store_destroy(source_tags);
}
generate_stats(&ScanCtx.index, args->treemap_threshold, ScanCtx.index.path);
store_destroy(ScanCtx.index.store);
}
@ -263,7 +276,7 @@ void sist2_index(index_args_t *args) {
IndexCtx.batch_size = args->batch_size;
if (!args->print) {
elastic_init(args->force_reset);
elastic_init(args->force_reset, args->es_mappings, args->es_settings);
}
char descriptor_path[PATH_MAX];
@ -289,6 +302,10 @@ void sist2_index(index_args_t *args) {
IndexCtx.tag_store = store_create(path_tmp, STORE_SIZE_TAG);
IndexCtx.tags = store_read_all(IndexCtx.tag_store);
snprintf(path_tmp, sizeof(path_tmp), "%s/meta", args->index_path);
IndexCtx.meta_store = store_create(path_tmp, STORE_SIZE_META);
IndexCtx.meta = store_read_all(IndexCtx.meta_store);
index_func f;
if (args->print) {
f = print_json;
@ -311,7 +328,7 @@ void sist2_index(index_args_t *args) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", args->index_path, de->d_name);
read_index(file_path, desc.uuid, desc.type, f);
read_index(file_path, desc.id, desc.type, f);
}
}
closedir(dir);
@ -321,7 +338,7 @@ void sist2_index(index_args_t *args) {
tpool_destroy(IndexCtx.pool);
if (!args->print) {
finish_indexer(args->script, args->async_script, desc.uuid);
finish_indexer(args->script, args->async_script, desc.id);
}
store_destroy(IndexCtx.tag_store);
@ -341,7 +358,7 @@ void sist2_exec_script(exec_args_t *args) {
LOG_DEBUGF("main.c", "descriptor version %s (%s)", desc.version, desc.type)
execute_update_script(args->script, args->async_script, desc.uuid);
execute_update_script(args->script, args->async_script, desc.id);
free(args->script);
}
@ -438,6 +455,8 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),
OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 100"),
OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings. "
@ -515,7 +534,7 @@ int main(int argc, const char *argv[]) {
}
sist2_web(web_args);
} else if (strcmp(argv[0], "exec-script") == 0) {
} else if (strcmp(argv[0], "exec-script") == 0) {
int err = exec_args_validate(exec_args, argc, argv);
if (err != 0) {

View File

@ -6,6 +6,7 @@
#define MAJOR_MIME(mime_id) (mime_id & 0x000F0000) >> 16
#define MIME_EMPTY 1
#define MIME_SIST2_SIDECAR 2
#define DONT_PARSE 0x80000000
#define SHOULD_PARSE(mime_id) (ScanCtx.fast == 0 && (mime_id & DONT_PARSE) != DONT_PARSE && mime_id != 0)

View File

@ -128,333 +128,334 @@ enum mime {
application_x_dvi=655480,
application_x_elc=655481,
application_x_empty=1,
application_x_envoy=655483,
application_x_esrehber=655484,
application_x_excel=655485,
application_x_executable=655486,
application_x_font_gdos=655487,
application_x_font_pf2=655488,
application_x_font_pfm=655489,
application_x_font_sfn=655490,
application_x_font_ttf=655491 | 0x20000000,
application_x_fptapplication_x_dbt=655492,
application_x_freelance=655493,
application_x_gamecube_rom=655494,
application_x_gdbm=655495,
application_x_gettext_translation=655496,
application_x_git=655497,
application_x_gsp=655498,
application_x_gss=655499,
application_x_gtar=655500,
application_x_gzip=655501,
application_x_hdf=655502,
application_x_helpfile=655503,
application_x_httpd_imap=655504,
application_x_ima=655505,
application_x_innosetup=655506,
application_x_internett_signup=655507,
application_x_inventor=655508,
application_x_ip2=655509,
application_x_java_applet=655510,
application_x_java_commerce=655511,
application_x_java_image=655512,
application_x_java_jmod=655513,
application_x_java_keystore=655514,
application_x_kdelnk=655515,
application_x_koan=655516,
application_x_latex=655517,
application_x_livescreen=655518,
application_x_lotus=655519,
application_x_lz4=655520 | 0x08000000,
application_x_lz4_json=655521,
application_x_lzh=655522,
application_x_lzh_compressed=655523,
application_x_lzip=655524 | 0x08000000,
application_x_lzma=655525 | 0x08000000,
application_x_lzop=655526 | 0x08000000,
application_x_lzx=655527,
application_x_mach_binary=655528,
application_x_mach_executable=655529,
application_x_magic_cap_package_1_0=655530,
application_x_mathcad=655531,
application_x_maxis_dbpf=655532,
application_x_meme=655533,
application_x_midi=655534,
application_x_mif=655535,
application_x_mix_transfer=655536,
application_x_mobipocket_ebook=655537 | 0x02000000,
application_x_ms_compress_szdd=655538,
application_x_ms_pdb=655539,
application_x_ms_reader=655540,
application_x_msaccess=655541,
application_x_n64_rom=655542,
application_x_navi_animation=655543,
application_x_navidoc=655544,
application_x_navimap=655545,
application_x_navistyle=655546,
application_x_nes_rom=655547,
application_x_netcdf=655548,
application_x_newton_compatible_pkg=655549,
application_x_nintendo_ds_rom=655550,
application_x_object=655551,
application_x_omc=655552,
application_x_omcdatamaker=655553,
application_x_omcregerator=655554,
application_x_pagemaker=655555,
application_x_pcl=655556,
application_x_pgp_keyring=655557,
application_x_pixclscript=655558,
application_x_pkcs7_certreqresp=655559,
application_x_pkcs7_signature=655560,
application_x_project=655561,
application_x_qpro=655562,
application_x_rar=655563 | 0x10000000,
application_x_rpm=655564,
application_x_sdp=655565,
application_x_sea=655566,
application_x_seelogo=655567,
application_x_setupscript=655568,
application_x_shar=655569,
application_x_sharedlib=655570,
application_x_shockwave_flash=655571,
application_x_snappy_framed=655572,
application_x_sprite=655573,
application_x_sqlite3=655574,
application_x_stargallery_thm=655575,
application_x_stuffit=655576,
application_x_sv4cpio=655577,
application_x_sv4crc=655578,
application_x_tar=655579 | 0x10000000,
application_x_tbook=655580,
application_x_terminfo=655581,
application_x_terminfo2=655582,
application_x_tex_tfm=655583,
application_x_texinfo=655584,
application_x_ustar=655585,
application_x_visio=655586,
application_x_vnd_audioexplosion_mzz=655587,
application_x_vnd_ls_xpix=655588,
application_x_vrml=655589,
application_x_wais_source=655590,
application_x_wine_extension_ini=655591,
application_x_wintalk=655592,
application_x_world=655593,
application_x_wri=655594,
application_x_x509_ca_cert=655595,
application_x_xz=655596 | 0x08000000,
application_x_zip=655597,
application_x_zstd=655598 | 0x08000000,
application_x_zstd_dictionary=655599,
application_xml=655600,
application_zip=655601 | 0x10000000,
application_zlib=655602,
audio_basic=458995 | 0x80000000,
audio_it=458996,
audio_make=458997,
audio_mid=458998,
audio_midi=458999,
audio_mp4=459000,
audio_mpeg=459001,
audio_ogg=459002,
audio_s3m=459003,
audio_tsp_audio=459004,
audio_tsplayer=459005,
audio_vnd_qcelp=459006,
audio_voxware=459007,
audio_x_aiff=459008,
audio_x_flac=459009,
audio_x_gsm=459010,
audio_x_hx_aac_adts=459011,
audio_x_jam=459012,
audio_x_liveaudio=459013,
audio_x_m4a=459014,
audio_x_midi=459015,
audio_x_mod=459016,
audio_x_mp4a_latm=459017,
audio_x_mpeg_3=459018,
audio_x_mpequrl=459019,
audio_x_nspaudio=459020,
audio_x_pn_realaudio=459021,
audio_x_psid=459022,
audio_x_realaudio=459023,
audio_x_s3m=459024,
audio_x_twinvq=459025,
audio_x_twinvq_plugin=459026,
audio_x_voc=459027,
audio_x_wav=459028,
audio_x_xbox_executable=459029 | 0x80000000,
audio_x_xbox360_executable=459030 | 0x80000000,
audio_xm=459031,
font_otf=327960 | 0x20000000,
font_sfnt=327961 | 0x20000000,
font_woff=327962 | 0x20000000,
font_woff2=327963 | 0x20000000,
image_bmp=524572,
image_cmu_raster=524573,
image_fif=524574,
image_florian=524575,
image_g3fax=524576,
image_gif=524577,
image_heic=524578,
image_ief=524579,
image_jpeg=524580,
image_jutvision=524581,
image_naplps=524582,
image_pict=524583,
image_png=524584,
image_svg=524585 | 0x80000000,
image_svg_xml=524586 | 0x80000000,
image_tiff=524587,
image_vnd_adobe_photoshop=524588 | 0x80000000,
image_vnd_djvu=524589 | 0x80000000,
image_vnd_fpx=524590,
image_vnd_microsoft_icon=524591,
image_vnd_rn_realflash=524592,
image_vnd_rn_realpix=524593,
image_vnd_wap_wbmp=524594,
image_vnd_xiff=524595,
image_webp=524596,
image_wmf=524597,
image_x_3ds=524598,
image_x_adobe_dng=524599 | 0x00800000,
image_x_award_bioslogo=524600,
image_x_canon_cr2=524601 | 0x00800000,
image_x_canon_crw=524602 | 0x00800000,
image_x_cmu_raster=524603,
image_x_cur=524604,
image_x_dcraw=524605 | 0x00800000,
image_x_dwg=524606,
image_x_eps=524607,
image_x_epson_erf=524608 | 0x00800000,
image_x_exr=524609,
image_x_fuji_raf=524610 | 0x00800000,
image_x_gem=524611,
image_x_icns=524612,
image_x_icon=524613 | 0x80000000,
image_x_jg=524614,
image_x_jps=524615,
image_x_kodak_dcr=524616 | 0x00800000,
image_x_kodak_k25=524617 | 0x00800000,
image_x_kodak_kdc=524618 | 0x00800000,
image_x_minolta_mrw=524619 | 0x00800000,
image_x_ms_bmp=524620,
image_x_niff=524621,
image_x_nikon_nef=524622 | 0x00800000,
image_x_olympus_orf=524623 | 0x00800000,
image_x_panasonic_raw=524624 | 0x00800000,
image_x_pcx=524625,
image_x_pentax_pef=524626 | 0x00800000,
image_x_pict=524627,
image_x_portable_bitmap=524628,
image_x_portable_graymap=524629,
image_x_portable_pixmap=524630,
image_x_quicktime=524631,
image_x_rgb=524632,
image_x_sigma_x3f=524633 | 0x00800000,
image_x_sony_arw=524634 | 0x00800000,
image_x_sony_sr2=524635 | 0x00800000,
image_x_sony_srf=524636 | 0x00800000,
image_x_tga=524637,
image_x_tiff=524638,
image_x_win_bitmap=524639,
image_x_xcf=524640 | 0x80000000,
image_x_xpixmap=524641 | 0x80000000,
image_x_xwindowdump=524642,
message_news=196963,
message_rfc822=196964,
model_vnd_dwf=65893,
model_vnd_gdl=65894,
model_vnd_gs_gdl=65895,
model_vrml=65896,
model_x_pov=65897,
text_PGP=590186,
text_asp=590187,
text_css=590188,
text_html=590189 | 0x01000000,
text_javascript=590190,
text_mcf=590191,
text_pascal=590192,
text_plain=590193,
text_richtext=590194,
text_rtf=590195,
text_scriplet=590196,
text_tab_separated_values=590197,
text_troff=590198,
text_uri_list=590199,
text_vnd_abc=590200,
text_vnd_fmi_flexstor=590201,
text_vnd_wap_wml=590202,
text_vnd_wap_wmlscript=590203,
text_webviewhtml=590204,
text_x_Algol68=590205,
text_x_asm=590206,
text_x_audiosoft_intra=590207,
text_x_awk=590208,
text_x_bcpl=590209,
text_x_c=590210,
text_x_c__=590211,
text_x_component=590212,
text_x_diff=590213,
text_x_fortran=590214,
text_x_java=590215,
text_x_la_asf=590216,
text_x_lisp=590217,
text_x_m=590218,
text_x_m4=590219,
text_x_makefile=590220,
text_x_ms_regedit=590221,
text_x_msdos_batch=590222,
text_x_objective_c=590223,
text_x_pascal=590224,
text_x_perl=590225,
text_x_php=590226,
text_x_po=590227,
text_x_python=590228,
text_x_ruby=590229,
text_x_sass=590230,
text_x_scss=590231,
text_x_server_parsed_html=590232,
text_x_setext=590233,
text_x_sgml=590234 | 0x01000000,
text_x_shellscript=590235,
text_x_speech=590236,
text_x_tcl=590237,
text_x_tex=590238,
text_x_uil=590239,
text_x_uuencode=590240,
text_x_vcalendar=590241,
text_x_vcard=590242,
text_xml=590243 | 0x01000000,
video_MP2T=393636,
video_animaflex=393637,
video_avi=393638,
video_avs_video=393639,
video_mp4=393640,
video_mpeg=393641,
video_quicktime=393642,
video_vdo=393643,
video_vivo=393644,
video_vnd_rn_realvideo=393645,
video_vosaic=393646,
video_webm=393647,
video_x_amt_demorun=393648,
video_x_amt_showrun=393649,
video_x_atomic3d_feature=393650,
video_x_dl=393651,
video_x_dv=393652,
video_x_fli=393653,
video_x_flv=393654,
video_x_isvideo=393655,
video_x_jng=393656 | 0x80000000,
video_x_m4v=393657,
video_x_matroska=393658,
video_x_mng=393659,
video_x_motion_jpeg=393660,
video_x_ms_asf=393661,
video_x_msvideo=393662,
video_x_qtc=393663,
video_x_sgi_movie=393664,
x_epoc_x_sisx_app=721345,
application_x_envoy=655482,
application_x_esrehber=655483,
application_x_excel=655484,
application_x_executable=655485,
application_x_font_gdos=655486,
application_x_font_pf2=655487,
application_x_font_pfm=655488,
application_x_font_sfn=655489,
application_x_font_ttf=655490 | 0x20000000,
application_x_fptapplication_x_dbt=655491,
application_x_freelance=655492,
application_x_gamecube_rom=655493,
application_x_gdbm=655494,
application_x_gettext_translation=655495,
application_x_git=655496,
application_x_gsp=655497,
application_x_gss=655498,
application_x_gtar=655499,
application_x_gzip=655500,
application_x_hdf=655501,
application_x_helpfile=655502,
application_x_httpd_imap=655503,
application_x_ima=655504,
application_x_innosetup=655505,
application_x_internett_signup=655506,
application_x_inventor=655507,
application_x_ip2=655508,
application_x_java_applet=655509,
application_x_java_commerce=655510,
application_x_java_image=655511,
application_x_java_jmod=655512,
application_x_java_keystore=655513,
application_x_kdelnk=655514,
application_x_koan=655515,
application_x_latex=655516,
application_x_livescreen=655517,
application_x_lotus=655518,
application_x_lz4=655519 | 0x08000000,
application_x_lz4_json=655520,
application_x_lzh=655521,
application_x_lzh_compressed=655522,
application_x_lzip=655523 | 0x08000000,
application_x_lzma=655524 | 0x08000000,
application_x_lzop=655525 | 0x08000000,
application_x_lzx=655526,
application_x_mach_binary=655527,
application_x_mach_executable=655528,
application_x_magic_cap_package_1_0=655529,
application_x_mathcad=655530,
application_x_maxis_dbpf=655531,
application_x_meme=655532,
application_x_midi=655533,
application_x_mif=655534,
application_x_mix_transfer=655535,
application_x_mobipocket_ebook=655536 | 0x02000000,
application_x_ms_compress_szdd=655537,
application_x_ms_pdb=655538,
application_x_ms_reader=655539,
application_x_msaccess=655540,
application_x_n64_rom=655541,
application_x_navi_animation=655542,
application_x_navidoc=655543,
application_x_navimap=655544,
application_x_navistyle=655545,
application_x_nes_rom=655546,
application_x_netcdf=655547,
application_x_newton_compatible_pkg=655548,
application_x_nintendo_ds_rom=655549,
application_x_object=655550,
application_x_omc=655551,
application_x_omcdatamaker=655552,
application_x_omcregerator=655553,
application_x_pagemaker=655554,
application_x_pcl=655555,
application_x_pgp_keyring=655556,
application_x_pixclscript=655557,
application_x_pkcs7_certreqresp=655558,
application_x_pkcs7_signature=655559,
application_x_project=655560,
application_x_qpro=655561,
application_x_rar=655562 | 0x10000000,
application_x_rpm=655563,
application_x_sdp=655564,
application_x_sea=655565,
application_x_seelogo=655566,
application_x_setupscript=655567,
application_x_shar=655568,
application_x_sharedlib=655569,
application_x_shockwave_flash=655570,
application_x_snappy_framed=655571,
application_x_sprite=655572,
application_x_sqlite3=655573,
application_x_stargallery_thm=655574,
application_x_stuffit=655575,
application_x_sv4cpio=655576,
application_x_sv4crc=655577,
application_x_tar=655578 | 0x10000000,
application_x_tbook=655579,
application_x_terminfo=655580,
application_x_terminfo2=655581,
application_x_tex_tfm=655582,
application_x_texinfo=655583,
application_x_ustar=655584,
application_x_visio=655585,
application_x_vnd_audioexplosion_mzz=655586,
application_x_vnd_ls_xpix=655587,
application_x_vrml=655588,
application_x_wais_source=655589,
application_x_wine_extension_ini=655590,
application_x_wintalk=655591,
application_x_world=655592,
application_x_wri=655593,
application_x_x509_ca_cert=655594,
application_x_xz=655595 | 0x08000000,
application_x_zip=655596,
application_x_zstd=655597 | 0x08000000,
application_x_zstd_dictionary=655598,
application_xml=655599,
application_zip=655600 | 0x10000000,
application_zlib=655601,
audio_basic=458994 | 0x80000000,
audio_it=458995,
audio_make=458996,
audio_mid=458997,
audio_midi=458998,
audio_mp4=458999,
audio_mpeg=459000,
audio_ogg=459001,
audio_s3m=459002,
audio_tsp_audio=459003,
audio_tsplayer=459004,
audio_vnd_qcelp=459005,
audio_voxware=459006,
audio_x_aiff=459007,
audio_x_flac=459008,
audio_x_gsm=459009,
audio_x_hx_aac_adts=459010,
audio_x_jam=459011,
audio_x_liveaudio=459012,
audio_x_m4a=459013,
audio_x_midi=459014,
audio_x_mod=459015,
audio_x_mp4a_latm=459016,
audio_x_mpeg_3=459017,
audio_x_mpequrl=459018,
audio_x_nspaudio=459019,
audio_x_pn_realaudio=459020,
audio_x_psid=459021,
audio_x_realaudio=459022,
audio_x_s3m=459023,
audio_x_twinvq=459024,
audio_x_twinvq_plugin=459025,
audio_x_voc=459026,
audio_x_wav=459027,
audio_x_xbox_executable=459028 | 0x80000000,
audio_x_xbox360_executable=459029 | 0x80000000,
audio_xm=459030,
font_otf=327959 | 0x20000000,
font_sfnt=327960 | 0x20000000,
font_woff=327961 | 0x20000000,
font_woff2=327962 | 0x20000000,
image_bmp=524571,
image_cmu_raster=524572,
image_fif=524573,
image_florian=524574,
image_g3fax=524575,
image_gif=524576,
image_heic=524577,
image_ief=524578,
image_jpeg=524579,
image_jutvision=524580,
image_naplps=524581,
image_pict=524582,
image_png=524583,
image_svg=524584 | 0x80000000,
image_svg_xml=524585 | 0x80000000,
image_tiff=524586,
image_vnd_adobe_photoshop=524587 | 0x80000000,
image_vnd_djvu=524588 | 0x80000000,
image_vnd_fpx=524589,
image_vnd_microsoft_icon=524590,
image_vnd_rn_realflash=524591,
image_vnd_rn_realpix=524592,
image_vnd_wap_wbmp=524593,
image_vnd_xiff=524594,
image_webp=524595,
image_wmf=524596,
image_x_3ds=524597,
image_x_adobe_dng=524598 | 0x00800000,
image_x_award_bioslogo=524599,
image_x_canon_cr2=524600 | 0x00800000,
image_x_canon_crw=524601 | 0x00800000,
image_x_cmu_raster=524602,
image_x_cur=524603,
image_x_dcraw=524604 | 0x00800000,
image_x_dwg=524605,
image_x_eps=524606,
image_x_epson_erf=524607 | 0x00800000,
image_x_exr=524608,
image_x_fuji_raf=524609 | 0x00800000,
image_x_gem=524610,
image_x_icns=524611,
image_x_icon=524612 | 0x80000000,
image_x_jg=524613,
image_x_jps=524614,
image_x_kodak_dcr=524615 | 0x00800000,
image_x_kodak_k25=524616 | 0x00800000,
image_x_kodak_kdc=524617 | 0x00800000,
image_x_minolta_mrw=524618 | 0x00800000,
image_x_ms_bmp=524619,
image_x_niff=524620,
image_x_nikon_nef=524621 | 0x00800000,
image_x_olympus_orf=524622 | 0x00800000,
image_x_panasonic_raw=524623 | 0x00800000,
image_x_pcx=524624,
image_x_pentax_pef=524625 | 0x00800000,
image_x_pict=524626,
image_x_portable_bitmap=524627,
image_x_portable_graymap=524628,
image_x_portable_pixmap=524629,
image_x_quicktime=524630,
image_x_rgb=524631,
image_x_sigma_x3f=524632 | 0x00800000,
image_x_sony_arw=524633 | 0x00800000,
image_x_sony_sr2=524634 | 0x00800000,
image_x_sony_srf=524635 | 0x00800000,
image_x_tga=524636,
image_x_tiff=524637,
image_x_win_bitmap=524638,
image_x_xcf=524639 | 0x80000000,
image_x_xpixmap=524640 | 0x80000000,
image_x_xwindowdump=524641,
message_news=196962,
message_rfc822=196963,
model_vnd_dwf=65892,
model_vnd_gdl=65893,
model_vnd_gs_gdl=65894,
model_vrml=65895,
model_x_pov=65896,
sist2_sidecar=2,
text_PGP=590185,
text_asp=590186,
text_css=590187,
text_html=590188 | 0x01000000,
text_javascript=590189,
text_mcf=590190,
text_pascal=590191,
text_plain=590192,
text_richtext=590193,
text_rtf=590194,
text_scriplet=590195,
text_tab_separated_values=590196,
text_troff=590197,
text_uri_list=590198,
text_vnd_abc=590199,
text_vnd_fmi_flexstor=590200,
text_vnd_wap_wml=590201,
text_vnd_wap_wmlscript=590202,
text_webviewhtml=590203,
text_x_Algol68=590204,
text_x_asm=590205,
text_x_audiosoft_intra=590206,
text_x_awk=590207,
text_x_bcpl=590208,
text_x_c=590209,
text_x_c__=590210,
text_x_component=590211,
text_x_diff=590212,
text_x_fortran=590213,
text_x_java=590214,
text_x_la_asf=590215,
text_x_lisp=590216,
text_x_m=590217,
text_x_m4=590218,
text_x_makefile=590219,
text_x_ms_regedit=590220,
text_x_msdos_batch=590221,
text_x_objective_c=590222,
text_x_pascal=590223,
text_x_perl=590224,
text_x_php=590225,
text_x_po=590226,
text_x_python=590227,
text_x_ruby=590228,
text_x_sass=590229,
text_x_scss=590230,
text_x_server_parsed_html=590231,
text_x_setext=590232,
text_x_sgml=590233 | 0x01000000,
text_x_shellscript=590234,
text_x_speech=590235,
text_x_tcl=590236,
text_x_tex=590237,
text_x_uil=590238,
text_x_uuencode=590239,
text_x_vcalendar=590240,
text_x_vcard=590241,
text_xml=590242 | 0x01000000,
video_MP2T=393635,
video_animaflex=393636,
video_avi=393637,
video_avs_video=393638,
video_mp4=393639,
video_mpeg=393640,
video_quicktime=393641,
video_vdo=393642,
video_vivo=393643,
video_vnd_rn_realvideo=393644,
video_vosaic=393645,
video_webm=393646,
video_x_amt_demorun=393647,
video_x_amt_showrun=393648,
video_x_atomic3d_feature=393649,
video_x_dl=393650,
video_x_dv=393651,
video_x_fli=393652,
video_x_flv=393653,
video_x_isvideo=393654,
video_x_jng=393655 | 0x80000000,
video_x_m4v=393656,
video_x_matroska=393657,
video_x_mng=393658,
video_x_motion_jpeg=393659,
video_x_ms_asf=393660,
video_x_msvideo=393661,
video_x_qtc=393662,
video_x_sgi_movie=393663,
x_epoc_x_sisx_app=721344,
};
char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) {
case application_arj: return "application/arj";
@ -906,6 +907,7 @@ case image_x_sony_arw: return "image/x-sony-arw";
case image_x_sony_sr2: return "image/x-sony-sr2";
case image_x_sony_srf: return "image/x-sony-srf";
case image_x_epson_erf: return "image/x-epson-erf";
case sist2_sidecar: return "sist2/sidecar";
default: return NULL;}}
GHashTable *mime_get_ext_table() {GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(ext_table, "arj", (gpointer)application_arj);
@ -1449,6 +1451,7 @@ g_hash_table_insert(ext_table, "arw", (gpointer)image_x_sony_arw);
g_hash_table_insert(ext_table, "sr2", (gpointer)image_x_sony_sr2);
g_hash_table_insert(ext_table, "srf", (gpointer)image_x_sony_srf);
g_hash_table_insert(ext_table, "erf", (gpointer)image_x_epson_erf);
g_hash_table_insert(ext_table, "s2meta", (gpointer)sist2_sidecar);
return ext_table;}
GHashTable *mime_get_mime_table() {GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(mime_table, "application/arj", (gpointer)application_arj);
@ -1900,5 +1903,6 @@ g_hash_table_insert(mime_table, "image/x-sony-arw", (gpointer)image_x_sony_arw);
g_hash_table_insert(mime_table, "image/x-sony-sr2", (gpointer)image_x_sony_sr2);
g_hash_table_insert(mime_table, "image/x-sony-srf", (gpointer)image_x_sony_srf);
g_hash_table_insert(mime_table, "image/x-epson-erf", (gpointer)image_x_epson_erf);
g_hash_table_insert(mime_table, "sist2/sidecar", (gpointer)sist2_sidecar);
return mime_table;}
#endif

View File

@ -4,6 +4,7 @@
#include "src/ctx.h"
#include "mime.h"
#include "src/io/serialize.h"
#include "src/parsing/sidecar.h"
#include <magic.h>
@ -45,29 +46,31 @@ void parse(void *arg) {
parse_job_t *job = arg;
document_t doc;
int inc_ts = incremental_get(ScanCtx.original_table, job->vfile.info.st_ino);
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
incremental_mark_file_for_copy(ScanCtx.copy_table, job->vfile.info.st_ino);
return;
}
doc.filepath = job->filepath;
doc.ext = (short) job->ext;
doc.base = (short) job->base;
char *rel_path = doc.filepath + ScanCtx.index.desc.root_len;
MD5((unsigned char *) rel_path, strlen(rel_path), doc.path_md5);
doc.meta_head = NULL;
doc.meta_tail = NULL;
doc.mime = 0;
doc.size = job->vfile.info.st_size;
doc.ino = job->vfile.info.st_ino;
doc.mtime = job->vfile.info.st_mtim.tv_sec;
uuid_generate(doc.uuid);
int inc_ts = incremental_get(ScanCtx.original_table, doc.path_md5);
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
incremental_mark_file_for_copy(ScanCtx.copy_table, doc.path_md5);
return;
}
char *buf[MAGIC_BUF_SIZE];
if (LogCtx.very_verbose) {
char uuid_str[UUID_STR_LEN];
uuid_unparse(doc.uuid, uuid_str);
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", uuid_str)
char path_md5_str[MD5_STR_LENGTH];
buf2hex(doc.path_md5, MD5_DIGEST_LENGTH, path_md5_str);
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", path_md5_str)
}
if (job->vfile.info.st_size == 0) {
@ -85,7 +88,8 @@ void parse(void *arg) {
// Get mime type with libmagic
if (!job->vfile.is_fs_file) {
LOG_WARNING(job->filepath, "Guessing mime type with libmagic inside archive files is not currently supported");
LOG_WARNING(job->filepath,
"Guessing mime type with libmagic inside archive files is not currently supported");
goto abort;
}
@ -157,16 +161,26 @@ void parse(void *arg) {
parse_comic(&ScanCtx.comic_ctx, &job->vfile, &doc);
} else if (IS_MOBI(doc.mime)) {
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, &doc);
} else if (doc.mime == MIME_SIST2_SIDECAR) {
parse_sidecar(&job->vfile, &doc);
CLOSE_FILE(job->vfile)
return;
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc.mime)) {
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, &doc);
}
abort:
//Parent meta
if (!uuid_is_null(job->parent)) {
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + UUID_STR_LEN + 1);
if (!md5_digest_is_null(job->parent)) {
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + MD5_STR_LENGTH);
meta_parent->key = MetaParent;
uuid_unparse(job->parent, meta_parent->str_val);
buf2hex(job->parent, MD5_DIGEST_LENGTH, meta_parent->str_val);
APPEND_META((&doc), meta_parent)
doc.has_parent = TRUE;
} else {
doc.has_parent = FALSE;
}
write_document(&doc);

35
src/parsing/sidecar.c Normal file
View File

@ -0,0 +1,35 @@
#include "sidecar.h"
#include "src/ctx.h"
void parse_sidecar(vfile_t *vfile, document_t *doc) {
LOG_DEBUGF("sidecar.c", "Parsing sidecar file %s", vfile->filepath)
size_t size;
char *buf = read_all(vfile, &size);
if (buf == NULL) {
LOG_ERRORF("sidecar.c", "Read error for %s", vfile->filepath)
return;
}
buf = realloc(buf, size + 1);
*(buf + size) = '\0';
cJSON *json = cJSON_Parse(buf);
if (json == NULL) {
LOG_ERRORF("sidecar.c", "Could not parse JSON sidecar %s", vfile->filepath)
return;
}
char *json_str = cJSON_PrintUnformatted(json);
unsigned char path_md5[MD5_DIGEST_LENGTH];
MD5((unsigned char *) vfile->filepath + ScanCtx.index.desc.root_len, doc->ext - 1 - ScanCtx.index.desc.root_len,
path_md5);
store_write(ScanCtx.index.meta_store, (char *) path_md5, sizeof(path_md5), json_str, strlen(json_str) + 1);
cJSON_Delete(json);
free(json_str);
free(buf);
}

8
src/parsing/sidecar.h Normal file
View File

@ -0,0 +1,8 @@
#ifndef SIST2_SIDECAR_H
#define SIST2_SIDECAR_H
#include "src/sist.h"
void parse_sidecar(vfile_t *vfile, document_t *doc);
#endif

View File

@ -23,9 +23,10 @@
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
#define UUID_STR_LEN 37
#define UNUSED(x) __attribute__((__unused__)) x
#define MD5_STR_LENGTH 33
#include "util.h"
#include "log.h"
#include "types.h"
@ -47,5 +48,4 @@
#include <errno.h>
#include <ctype.h>
#endif

View File

@ -1 +1 @@
.irs-bar,.irs-bar-edge,.irs-line-left,.irs-line-mid,.irs-line-right,.irs-slider{background:url("../img/sprite-skin-flat.png") repeat-x}.irs{height:40px}.irs-with-grid{height:60px}.irs-line{height:12px;top:25px}.irs-line-left{height:12px;background-position:0 -30px}.irs-line-mid{height:12px;background-position:0 0}.irs-line-right{height:12px;background-position:100% -30px}.irs-bar{height:12px;top:25px;background-position:0 -60px}.irs-bar-edge{top:25px;height:12px;width:9px;background-position:0 -90px}.irs-shadow{height:3px;top:34px;background:#000;opacity:0.25}.lt-ie9 .irs-shadow{filter: alpha(opacity=25)}.irs-slider{width:16px;height:18px;top:22px;background-position:0 -120px}.irs-slider.state_hover,.irs-slider:hover{background-position:0 -150px}.irs-max,.irs-min{color:#999;font-size:10px;line-height:1.333;text-shadow:none;top:0;padding:1px 3px;background:#e1e4e9;-moz-border-radius:4px;border-radius:4px}.irs-from,.irs-single,.irs-to{color:#fff;font-size:10px;line-height:1.333;text-shadow:none;padding:1px 5px;background:#2196F3;-moz-border-radius:4px;border-radius:4px}.irs-from:after,.irs-single:after,.irs-to:after{position:absolute;display:block;content:"";bottom:-6px;left:50%;width:0;height:0;margin-left:-3px;overflow:hidden;border:3px solid transparent;border-top-color:#2196F3}.irs-grid-pol{background:#e1e4e9}.irs-grid-text{color:#999}.irs-disabled{}
.irs-bar,.irs-bar-edge,.irs-line-left,.irs-line-mid,.irs-line-right,.irs-slider{background:url("./img/sprite-skin-flat.png") repeat-x}.irs{height:40px}.irs-with-grid{height:60px}.irs-line{height:12px;top:25px}.irs-line-left{height:12px;background-position:0 -30px}.irs-line-mid{height:12px;background-position:0 0}.irs-line-right{height:12px;background-position:100% -30px}.irs-bar{height:12px;top:25px;background-position:0 -60px}.irs-bar-edge{top:25px;height:12px;width:9px;background-position:0 -90px}.irs-shadow{height:3px;top:34px;background:#000;opacity:0.25}.lt-ie9 .irs-shadow{filter: alpha(opacity=25)}.irs-slider{width:16px;height:18px;top:22px;background-position:0 -120px}.irs-slider.state_hover,.irs-slider:hover{background-position:0 -150px}.irs-max,.irs-min{color:#999;font-size:10px;line-height:1.333;text-shadow:none;top:0;padding:1px 3px;background:#e1e4e9;-moz-border-radius:4px;border-radius:4px}.irs-from,.irs-single,.irs-to{color:#fff;font-size:10px;line-height:1.333;text-shadow:none;padding:1px 5px;background:#2196F3;-moz-border-radius:4px;border-radius:4px}.irs-from:after,.irs-single:after,.irs-to:after{position:absolute;display:block;content:"";bottom:-6px;left:50%;width:0;height:0;margin-left:-3px;overflow:hidden;border:3px solid transparent;border-top-color:#2196F3}.irs-grid-pol{background:#e1e4e9}.irs-grid-text{color:#999}.irs-disabled{}

1
src/static/js/8_md5.min.js vendored Normal file
View File

@ -0,0 +1 @@
!function(n){"use strict";function d(n,t){var r=(65535&n)+(65535&t);return(n>>16)+(t>>16)+(r>>16)<<16|65535&r}function f(n,t,r,e,o,u){return d((c=d(d(t,n),d(e,u)))<<(f=o)|c>>>32-f,r);var c,f}function l(n,t,r,e,o,u,c){return f(t&r|~t&e,n,t,o,u,c)}function v(n,t,r,e,o,u,c){return f(t&e|r&~e,n,t,o,u,c)}function g(n,t,r,e,o,u,c){return f(t^r^e,n,t,o,u,c)}function m(n,t,r,e,o,u,c){return f(r^(t|~e),n,t,o,u,c)}function i(n,t){var r,e,o,u;n[t>>5]|=128<<t%32,n[14+(t+64>>>9<<4)]=t;for(var c=1732584193,f=-271733879,i=-1732584194,a=271733878,h=0;h<n.length;h+=16)c=l(r=c,e=f,o=i,u=a,n[h],7,-680876936),a=l(a,c,f,i,n[h+1],12,-389564586),i=l(i,a,c,f,n[h+2],17,606105819),f=l(f,i,a,c,n[h+3],22,-1044525330),c=l(c,f,i,a,n[h+4],7,-176418897),a=l(a,c,f,i,n[h+5],12,1200080426),i=l(i,a,c,f,n[h+6],17,-1473231341),f=l(f,i,a,c,n[h+7],22,-45705983),c=l(c,f,i,a,n[h+8],7,1770035416),a=l(a,c,f,i,n[h+9],12,-1958414417),i=l(i,a,c,f,n[h+10],17,-42063),f=l(f,i,a,c,n[h+11],22,-1990404162),c=l(c,f,i,a,n[h+12],7,1804603682),a=l(a,c,f,i,n[h+13],12,-40341101),i=l(i,a,c,f,n[h+14],17,-1502002290),c=v(c,f=l(f,i,a,c,n[h+15],22,1236535329),i,a,n[h+1],5,-165796510),a=v(a,c,f,i,n[h+6],9,-1069501632),i=v(i,a,c,f,n[h+11],14,643717713),f=v(f,i,a,c,n[h],20,-373897302),c=v(c,f,i,a,n[h+5],5,-701558691),a=v(a,c,f,i,n[h+10],9,38016083),i=v(i,a,c,f,n[h+15],14,-660478335),f=v(f,i,a,c,n[h+4],20,-405537848),c=v(c,f,i,a,n[h+9],5,568446438),a=v(a,c,f,i,n[h+14],9,-1019803690),i=v(i,a,c,f,n[h+3],14,-187363961),f=v(f,i,a,c,n[h+8],20,1163531501),c=v(c,f,i,a,n[h+13],5,-1444681467),a=v(a,c,f,i,n[h+2],9,-51403784),i=v(i,a,c,f,n[h+7],14,1735328473),c=g(c,f=v(f,i,a,c,n[h+12],20,-1926607734),i,a,n[h+5],4,-378558),a=g(a,c,f,i,n[h+8],11,-2022574463),i=g(i,a,c,f,n[h+11],16,1839030562),f=g(f,i,a,c,n[h+14],23,-35309556),c=g(c,f,i,a,n[h+1],4,-1530992060),a=g(a,c,f,i,n[h+4],11,1272893353),i=g(i,a,c,f,n[h+7],16,-155497632),f=g(f,i,a,c,n[h+10],23,-1094730640),c=g(c,f,i,a,n[h+13],4,681279174),a=g(a,c,f,i,n[h],11,-358537222),i=g(i,a,c,f,n[h+3],16,-722521979),f=g(f,i,a,c,n[h+6],23,76029189),c=g(c,f,i,a,n[h+9],4,-640364487),a=g(a,c,f,i,n[h+12],11,-421815835),i=g(i,a,c,f,n[h+15],16,530742520),c=m(c,f=g(f,i,a,c,n[h+2],23,-995338651),i,a,n[h],6,-198630844),a=m(a,c,f,i,n[h+7],10,1126891415),i=m(i,a,c,f,n[h+14],15,-1416354905),f=m(f,i,a,c,n[h+5],21,-57434055),c=m(c,f,i,a,n[h+12],6,1700485571),a=m(a,c,f,i,n[h+3],10,-1894986606),i=m(i,a,c,f,n[h+10],15,-1051523),f=m(f,i,a,c,n[h+1],21,-2054922799),c=m(c,f,i,a,n[h+8],6,1873313359),a=m(a,c,f,i,n[h+15],10,-30611744),i=m(i,a,c,f,n[h+6],15,-1560198380),f=m(f,i,a,c,n[h+13],21,1309151649),c=m(c,f,i,a,n[h+4],6,-145523070),a=m(a,c,f,i,n[h+11],10,-1120210379),i=m(i,a,c,f,n[h+2],15,718787259),f=m(f,i,a,c,n[h+9],21,-343485551),c=d(c,r),f=d(f,e),i=d(i,o),a=d(a,u);return[c,f,i,a]}function a(n){for(var t="",r=32*n.length,e=0;e<r;e+=8)t+=String.fromCharCode(n[e>>5]>>>e%32&255);return t}function h(n){var t=[];for(t[(n.length>>2)-1]=void 0,e=0;e<t.length;e+=1)t[e]=0;for(var r=8*n.length,e=0;e<r;e+=8)t[e>>5]|=(255&n.charCodeAt(e/8))<<e%32;return t}function e(n){for(var t,r="0123456789abcdef",e="",o=0;o<n.length;o+=1)t=n.charCodeAt(o),e+=r.charAt(t>>>4&15)+r.charAt(15&t);return e}function r(n){return unescape(encodeURIComponent(n))}function o(n){return a(i(h(t=r(n)),8*t.length));var t}function u(n,t){return function(n,t){var r,e,o=h(n),u=[],c=[];for(u[15]=c[15]=void 0,16<o.length&&(o=i(o,8*n.length)),r=0;r<16;r+=1)u[r]=909522486^o[r],c[r]=1549556828^o[r];return e=i(u.concat(h(t)),512+8*t.length),a(i(c.concat(e),640))}(r(n),r(t))}function t(n,t,r){return t?r?u(t,n):e(u(t,n)):r?o(n):e(o(n))}"function"==typeof define&&define.amd?define(function(){return t}):"object"==typeof module&&module.exports?module.exports=t:n.md5=t}(this);

View File

@ -22,7 +22,7 @@ function gifOver(thumbnail, hit) {
thumbnail.addEventListener("mouseout", function () {
//Reset timer
thumbnail.mouseStayedOver = false;
thumbnail.setAttribute("src", `t/${hit["_source"]["index"]}/${hit["_id"]}`);
thumbnail.setAttribute("src", `t/${hit["_source"]["index"]}/${hit["_path_md5"]}`);
})
}
@ -419,7 +419,7 @@ function makeThumbnail(mimeCategory, hit, imgWrapper, small) {
thumbnail.setAttribute("class", "card-img-top fit");
}
}
thumbnail.setAttribute("src", `t/${hit["_source"]["index"]}/${hit["_id"]}`);
thumbnail.setAttribute("src", `t/${hit["_source"]["index"]}/${hit["_path_md5"]}`);
if (shouldDisplayRawImage(hit)) {
thumbnail.addEventListener("click", () => {

View File

@ -168,13 +168,13 @@ window.onload = () => {
};
function saveTag(tag, hit) {
const relPath = hit["_source"]["path"] + "/" + hit["_source"]["name"] + ext(hit);
const relPath = hit["_source"]["path"] + (hit["_source"]["path"] ? "/" : "") + hit["_source"]["name"] + ext(hit);
return $.jsonPost("/tag/" + hit["_source"]["index"], {
delete: false,
name: tag,
doc_id: hit["_id"],
relpath: relPath
path_md5: md5(relPath)
}).then(() => {
tagBar.blur();
$("#tagModal").modal("hide");
@ -604,6 +604,7 @@ function search(after = null) {
hits.forEach(hit => {
hit["_source"]["name"] = strUnescape(hit["_source"]["name"]);
hit["_source"]["path"] = strUnescape(hit["_source"]["path"]);
hit["_path_md5"] = md5(hit["_source"]["path"] + (hit["_source"]["path"] ? "/" : "") + hit["_source"]["name"] + ext(hit));
});
if (!after) {

View File

@ -12,9 +12,9 @@
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="/">sist2</a>
<span class="badge badge-pill version">2.8.4</span>
<span class="badge badge-pill version">2.9.0</span>
<span class="tagline">Lightning-fast file system indexer and search tool </span>
<a class="btn ml-auto" href="/stats">Stats</a>
<a class="btn ml-auto" href="stats">Stats</a>
<button class="btn" type="button" data-toggle="modal" data-target="#settings" onclick="loadSettings()">Settings
</button>
<button class="btn" title="Toggle theme" onclick="toggleTheme()">Theme</button>

View File

@ -10,7 +10,7 @@
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="/">sist2</a>
<span class="badge badge-pill version">2.8.4</span>
<span class="badge badge-pill version">2.9.0</span>
<span class="tagline">Lightning-fast file system indexer and search tool </span>
<a style="margin-left: auto" class="btn" href="/">Back</a>
<button class="btn" type="button" data-toggle="modal" data-target="#settings"
@ -29,13 +29,13 @@
</div>
<div id="treemap-card" class="stats-card">
<button class="btn stats-btn" onclick="fullScreen('treemap-card')">Enlarge</button>
<button class="btn stats-btn" onclick="fullScreen('treemap-card')" id="treemap-card-enlarge">Enlarge</button>
<button class="btn stats-btn" onclick="exportTreemap()">Export</button>
<svg id="treemap"></svg>
</div>
<div id="graphs-card" class="stats-card">
<button class="btn stats-btn" onclick="fullScreen('graphs-card')">Enlarge</button>
<button class="btn stats-btn" onclick="fullScreen('graphs-card')" id="graphs-card-enlarge">Enlarge</button>
<div class="graph">
<svg id="agg_mime_size"></svg>
</div>
@ -727,7 +727,7 @@ function updateStats() {
const indexId = $("#indices").val();
d3.csv(`/s/${indexId}/1`).then(tabularData => {
d3.csv(`./s/${indexId}/1`).then(tabularData => {
tabularData.forEach(row => {
row.taxonomy = row.path.split("/");
row.size = Number(row.size);
@ -742,16 +742,16 @@ function updateStats() {
}
});
d3.csv(`/s/${indexId}/2`).then(tabularData => {
d3.csv(`./s/${indexId}/2`).then(tabularData => {
mimeBarSize(tabularData.slice(), mimeSvgSize);
mimeBarCount(tabularData.slice(), mimeSvgCount);
});
d3.csv(`/s/${indexId}/3`).then(tabularData => {
d3.csv(`./s/${indexId}/3`).then(tabularData => {
sizeHistogram(tabularData, sizeHistogramSvg);
});
d3.csv(`/s/${indexId}/4`).then(tabularData => {
d3.csv(`./s/${indexId}/4`).then(tabularData => {
dateHistogram(tabularData, dateHistogramSvg);
});
@ -789,7 +789,15 @@ window.onload = function () {
function fullScreen(selector) {
const card = document.getElementById(selector);
const btn = document.getElementById(selector + "-enlarge");
card.classList.toggle("full-screen");
if (card.classList.contains("full-screen")) {
btn.innerText = "Shrink";
} else {
btn.innerText = "Enlarge";
}
}
function exportTreemap() {

View File

@ -2,8 +2,6 @@
#include "io/serialize.h"
#include "ctx.h"
#include <glib.h>
static GHashTable *FlatTree;
static GHashTable *BufferTable;
@ -22,7 +20,7 @@ typedef struct {
long count;
} agg_t;
void fill_tables(cJSON *document, UNUSED(const char uuid_str[UUID_STR_LEN])) {
void fill_tables(cJSON *document, UNUSED(const char index_id[MD5_STR_LENGTH])) {
if (cJSON_GetObjectItem(document, "parent") != NULL) {
return;
@ -103,8 +101,8 @@ void read_index_into_tables(index_t *index) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", index->path, de->d_name);
read_index(file_path, index->desc.uuid, index->desc.type, fill_tables);
snprintf(file_path, PATH_MAX, "%s%s", index->path, de->d_name);
read_index(file_path, index->desc.id, index->desc.type, fill_tables);
}
}
closedir(dir);

View File

@ -6,7 +6,7 @@
#define INDEX_VERSION_EXTERNAL "_external_v1"
typedef struct index_descriptor {
char uuid[UUID_STR_LEN];
char id[MD5_STR_LENGTH];
char version[64];
long timestamp;
char root[PATH_MAX];
@ -20,6 +20,7 @@ typedef struct index_t {
struct index_descriptor desc;
struct store_t *store;
struct store_t *tag_store;
struct store_t *meta_store;
char path[PATH_MAX];
} index_t;

View File

@ -2,7 +2,6 @@
#include "src/ctx.h"
#include <wordexp.h>
#include <glib.h>
#define PBSTR "========================================"
#define PBWIDTH 40
@ -125,7 +124,7 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
}
GHashTable *incremental_get_table() {
GHashTable *file_table = g_hash_table_new(g_direct_hash, g_direct_equal);
GHashTable *file_table = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL);
return file_table;
}

View File

@ -10,6 +10,8 @@
#include "third-party/utf8.h/utf8.h"
#include "libscan/scan.h"
#define MD5_STR_LENGTH 33
char *abspath(const char *path);
@ -21,25 +23,6 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
GHashTable *incremental_get_table();
__always_inline
static void incremental_put(GHashTable *table, unsigned long inode_no, int mtime) {
g_hash_table_insert(table, (gpointer) inode_no, GINT_TO_POINTER(mtime));
}
__always_inline
static int incremental_get(GHashTable *table, unsigned long inode_no) {
if (table != NULL) {
return GPOINTER_TO_INT(g_hash_table_lookup(table, (gpointer) inode_no));
} else {
return 0;
}
}
__always_inline
static int incremental_mark_file_for_copy(GHashTable *table, unsigned long inode_no) {
return g_hash_table_insert(table, GINT_TO_POINTER(inode_no), GINT_TO_POINTER(1));
}
const char *find_file_in_paths(const char **paths, const char *filename);
@ -48,4 +31,95 @@ void str_escape(char *dst, const char *str);
void str_unescape(char *dst, const char *str);
static int hex2buf(const char *str, int len, unsigned char *bytes) {
static const uint8_t hashmap[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
for (int pos = 0; pos < len; pos += 2) {
int idx0 = (uint8_t) str[pos + 0];
int idx1 = (uint8_t) str[pos + 1];
bytes[pos / 2] = (uint8_t) (hashmap[idx0] << 4) | hashmap[idx1];
}
return TRUE;
}
__always_inline
static void buf2hex(const unsigned char *buf, size_t buflen, char *hex_string) {
static const char hexdig[] = "0123456789abcdef";
const unsigned char *p;
size_t i;
char *s = hex_string;
for (i = 0, p = buf; i < buflen; i++, p++) {
*s++ = hexdig[(*p >> 4) & 0x0f];
*s++ = hexdig[*p & 0x0f];
}
*s = '\0';
}
__always_inline
static int md5_digest_is_null(const unsigned char digest[MD5_DIGEST_LENGTH]) {
return (*(int64_t *) digest) == 0 && (*((int64_t *) digest + 1)) == 0;
}
__always_inline
static void incremental_put(GHashTable *table, unsigned char path_md5[MD5_DIGEST_LENGTH], int mtime) {
char *ptr = malloc(MD5_STR_LENGTH);
buf2hex(path_md5, MD5_DIGEST_LENGTH, ptr);
g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
}
__always_inline
static int incremental_get(GHashTable *table, unsigned char path_md5[MD5_DIGEST_LENGTH]) {
if (table != NULL) {
char md5_str[MD5_STR_LENGTH];
buf2hex(path_md5, MD5_DIGEST_LENGTH, md5_str);
return GPOINTER_TO_INT(g_hash_table_lookup(table, md5_str));
} else {
return 0;
}
}
__always_inline
static int incremental_mark_file_for_copy(GHashTable *table, unsigned char path_md5[MD5_DIGEST_LENGTH]) {
char *ptr = malloc(MD5_STR_LENGTH);
buf2hex(path_md5, MD5_DIGEST_LENGTH, ptr);
return g_hash_table_insert(table, ptr, GINT_TO_POINTER(1));
}
#endif

View File

@ -36,7 +36,7 @@ static void send_response_line(struct mg_connection *nc, int status_code, int le
index_t *get_index_by_id(const char *index_id) {
for (int i = WebCtx.index_count; i >= 0; i--) {
if (strcmp(index_id, WebCtx.indices[i].desc.uuid) == 0) {
if (strncmp(index_id, WebCtx.indices[i].desc.id, MD5_STR_LENGTH) == 0) {
return &WebCtx.indices[i];
}
}
@ -73,17 +73,17 @@ void stats(struct mg_connection *nc) {
void stats_files(struct mg_connection *nc, struct http_message *hm, struct mg_str *path) {
if (path->len != UUID_STR_LEN + 4) {
if (path->len != MD5_STR_LENGTH + 4) {
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
return;
}
char arg_uuid[UUID_STR_LEN];
memcpy(arg_uuid, hm->uri.p + 3, UUID_STR_LEN);
*(arg_uuid + UUID_STR_LEN - 1) = '\0';
char arg_md5[MD5_STR_LENGTH];
memcpy(arg_md5, hm->uri.p + 3, MD5_STR_LENGTH);
*(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
index_t *index = get_index_by_id(arg_uuid);
index_t *index = get_index_by_id(arg_md5);
if (index == NULL) {
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
@ -91,7 +91,7 @@ void stats_files(struct mg_connection *nc, struct http_message *hm, struct mg_st
}
const char *file;
switch (atoi(hm->uri.p + 3 + UUID_STR_LEN)) {
switch (atoi(hm->uri.p + 3 + MD5_STR_LENGTH)) {
case 1:
file = "treemap.csv";
break;
@ -179,29 +179,23 @@ void img_sprite_skin_flat(struct mg_connection *nc, struct http_message *hm) {
void thumbnail(struct mg_connection *nc, struct http_message *hm, struct mg_str *path) {
if (path->len != UUID_STR_LEN * 2 + 2) {
if (path->len != 68) {
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) path->len, path->p)
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
return;
}
char arg_uuid[UUID_STR_LEN];
char arg_index[UUID_STR_LEN];
char arg_file_md5[MD5_STR_LENGTH];
char arg_index[MD5_STR_LENGTH];
memcpy(arg_index, hm->uri.p + 3, UUID_STR_LEN);
*(arg_index + UUID_STR_LEN - 1) = '\0';
memcpy(arg_uuid, hm->uri.p + 3 + UUID_STR_LEN, UUID_STR_LEN);
*(arg_uuid + UUID_STR_LEN - 1) = '\0';
memcpy(arg_index, hm->uri.p + 3, MD5_STR_LENGTH);
*(arg_index + MD5_STR_LENGTH - 1) = '\0';
memcpy(arg_file_md5, hm->uri.p + 3 + MD5_STR_LENGTH, MD5_STR_LENGTH);
*(arg_file_md5 + MD5_STR_LENGTH - 1) = '\0';
uuid_t uuid;
int ret = uuid_parse(arg_uuid, uuid);
if (ret != 0) {
LOG_DEBUGF("serve.c", "Invalid thumbnail UUID: %s", arg_uuid)
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
return;
}
unsigned char md5_buf[MD5_DIGEST_LENGTH];
hex2buf(arg_file_md5, MD5_STR_LENGTH - 1, md5_buf);
store_t *store = get_store(arg_index);
if (store == NULL) {
@ -212,7 +206,7 @@ void thumbnail(struct mg_connection *nc, struct http_message *hm, struct mg_str
}
size_t data_len = 0;
char *data = store_read(store, (char *) uuid, sizeof(uuid_t), &data_len);
char *data = store_read(store, (char *) md5_buf, sizeof(md5_buf), &data_len);
if (data_len != 0) {
send_response_line(nc, 200, data_len, "Content-Type: image/jpeg");
mg_send(nc, data, data_len);
@ -305,7 +299,7 @@ void index_info(struct mg_connection *nc) {
cJSON *idx_json = cJSON_CreateObject();
cJSON_AddStringToObject(idx_json, "name", idx->desc.name);
cJSON_AddStringToObject(idx_json, "version", idx->desc.version);
cJSON_AddStringToObject(idx_json, "id", idx->desc.uuid);
cJSON_AddStringToObject(idx_json, "id", idx->desc.id);
cJSON_AddNumberToObject(idx_json, "timestamp", (double) idx->desc.timestamp);
cJSON_AddItemToArray(arr, idx_json);
}
@ -323,18 +317,18 @@ void index_info(struct mg_connection *nc) {
void document_info(struct mg_connection *nc, struct http_message *hm, struct mg_str *path) {
if (path->len != UUID_STR_LEN + 2) {
if (path->len != MD5_STR_LENGTH + 2) {
LOG_DEBUGF("serve.c", "Invalid document_info path: %.*s", (int) path->len, path->p)
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
return;
}
char arg_uuid[UUID_STR_LEN];
memcpy(arg_uuid, hm->uri.p + 3, UUID_STR_LEN);
*(arg_uuid + UUID_STR_LEN - 1) = '\0';
char arg_md5[MD5_STR_LENGTH];
memcpy(arg_md5, hm->uri.p + 3, MD5_STR_LENGTH);
*(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
cJSON *doc = elastic_get_document(arg_uuid);
cJSON *doc = elastic_get_document(arg_md5);
cJSON *source = cJSON_GetObjectItem(doc, "_source");
cJSON *index_id = cJSON_GetObjectItem(source, "index");
@ -364,18 +358,18 @@ void document_info(struct mg_connection *nc, struct http_message *hm, struct mg_
void file(struct mg_connection *nc, struct http_message *hm, struct mg_str *path) {
if (path->len != UUID_STR_LEN + 2) {
if (path->len != MD5_STR_LENGTH + 2) {
LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) path->len, path->p)
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
return;
}
char arg_uuid[UUID_STR_LEN];
memcpy(arg_uuid, hm->uri.p + 3, UUID_STR_LEN);
*(arg_uuid + UUID_STR_LEN - 1) = '\0';
char arg_md5[MD5_STR_LENGTH];
memcpy(arg_md5, hm->uri.p + 3, MD5_STR_LENGTH);
*(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
const char *next = arg_uuid;
const char *next = arg_md5;
cJSON *doc = NULL;
cJSON *index_id = NULL;
cJSON *source = NULL;
@ -430,7 +424,7 @@ void status(struct mg_connection *nc) {
typedef struct {
char *name;
int delete;
char *relpath;
char *path_md5_str;
char *doc_id;
} tag_req_t;
@ -450,8 +444,9 @@ tag_req_t *parse_tag_request(cJSON *json) {
return NULL;
}
cJSON *arg_relpath = cJSON_GetObjectItem(json, "relpath");
if (arg_relpath == NULL || !cJSON_IsString(arg_relpath)) {
cJSON *arg_path_md5 = cJSON_GetObjectItem(json, "path_md5");
if (arg_path_md5 == NULL || !cJSON_IsString(arg_path_md5) ||
strlen(arg_path_md5->valuestring) != MD5_STR_LENGTH - 1) {
return NULL;
}
@ -463,23 +458,23 @@ tag_req_t *parse_tag_request(cJSON *json) {
tag_req_t *req = malloc(sizeof(tag_req_t));
req->delete = arg_delete->valueint;
req->name = arg_name->valuestring;
req->relpath = arg_relpath->valuestring;
req->path_md5_str = arg_path_md5->valuestring;
req->doc_id = arg_doc_id->valuestring;
return req;
}
void tag(struct mg_connection *nc, struct http_message *hm, struct mg_str *path) {
if (path->len != UUID_STR_LEN + 4) {
if (path->len != MD5_STR_LENGTH + 4) {
LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) path->len, path->p)
mg_http_send_error(nc, 404, NULL);
nc->flags |= MG_F_SEND_AND_CLOSE;
return;
}
char arg_index[UUID_STR_LEN];
memcpy(arg_index, hm->uri.p + 5, UUID_STR_LEN);
*(arg_index + UUID_STR_LEN - 1) = '\0';
char arg_index[MD5_STR_LENGTH];
memcpy(arg_index, hm->uri.p + 5, MD5_STR_LENGTH);
*(arg_index + MD5_STR_LENGTH - 1) = '\0';
if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) {
LOG_DEBUG("serve.c", "Invalid tag request")
@ -514,7 +509,7 @@ void tag(struct mg_connection *nc, struct http_message *hm, struct mg_str *path)
cJSON *arr = NULL;
size_t data_len = 0;
const char *data = store_read(store, arg_req->relpath, strlen(arg_req->relpath), &data_len);
const char *data = store_read(store, arg_req->path_md5_str, MD5_STR_LENGTH, &data_len);
if (data_len == 0) {
arr = cJSON_CreateArray();
} else {
@ -574,7 +569,8 @@ void tag(struct mg_connection *nc, struct http_message *hm, struct mg_str *path)
}
char *json_str = cJSON_PrintUnformatted(arr);
store_write(store, arg_req->relpath, strlen(arg_req->relpath) + 1, json_str, strlen(json_str) + 1);
store_write(store, arg_req->path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
store_flush(store);
free(arg_req);
free(json_str);

File diff suppressed because one or more lines are too long

75
tests/test_scan.py Normal file
View File

@ -0,0 +1,75 @@
import unittest
import subprocess
import shutil
import json
import os
TEST_FILES = "third-party/libscan/libscan-test-files/test_files"
def copy_files(files):
base = os.path.basename(files)
new_path = os.path.join("/tmp/sist2_test/", base)
shutil.rmtree(new_path, ignore_errors=True)
shutil.copytree(files, new_path)
return new_path
def sist2(*args):
return subprocess.check_output(
args=["./sist2_debug", *args],
)
def sist2_index(files, *args):
path = copy_files(files)
shutil.rmtree("i", ignore_errors=True)
sist2("scan", path, "-o", "i", *args)
return iter(sist2_index_to_dict("i"))
def sist2_incremental_index(files, func=None, *args):
path = copy_files(files)
if func:
func(path)
shutil.rmtree("i_inc", ignore_errors=True)
sist2("scan", path, "-o", "i_inc", "--incremental", "i", *args)
return iter(sist2_index_to_dict("i_inc"))
def sist2_index_to_dict(index):
res = subprocess.check_output(
args=["./sist2_debug", "index", "--print", index],
)
for line in res.splitlines():
if line:
yield json.loads(line)
class ScanTest(unittest.TestCase):
def test_incremental1(self):
def remove_files(path):
os.remove(os.path.join(path, "msdoc/test1.doc"))
os.remove(os.path.join(path, "msdoc/test2.doc"))
def add_files(path):
with open(os.path.join(path, "newfile1"), "w"):
pass
with open(os.path.join(path, "newfile2"), "w"):
pass
with open(os.path.join(path, "newfile3"), "w"):
pass
file_count = sum(1 for _ in sist2_index(TEST_FILES))
self.assertEqual(sum(1 for _ in sist2_incremental_index(TEST_FILES, remove_files)), file_count - 2)
self.assertEqual(sum(1 for _ in sist2_incremental_index(TEST_FILES, add_files)), file_count + 3)
if __name__ == "__main__":
unittest.main()

@ -1 +1 @@
Subproject commit 4ed6099cb33245b06343518b9f3c45ac56e8283c
Subproject commit 3f4e3594a6891b942d5a711781d5425111aa13bf

2
third-party/libscan vendored

@ -1 +1 @@
Subproject commit 21f1f4b98ad3b29839379e138a8718f192876a25
Subproject commit ae9fadec473e6e4ade05259fe359c5366c3f3af6