Compare commits

...

8 Commits

Author SHA1 Message Date
e72fa1587b EXIF metadata for images 2019-11-09 15:18:44 -05:00
ea4fb7fa0d Bug fixes 2019-11-09 12:00:07 -05:00
b0a868bb73 remove 'must match' 2019-11-08 21:46:54 -05:00
d761a3b595 update readme 2019-11-08 19:42:36 -05:00
2d7a8a2fdc fuzzy toggle 2019-11-08 16:15:10 -05:00
152d2ddf8a bug fix in deserialize 2019-11-08 09:03:44 -05:00
bc5f22b759 update readme 2019-11-05 18:59:00 -05:00
534b397876 update readme, UI tweak: don't show broken images 2019-11-03 10:39:02 -05:00
12 changed files with 99 additions and 61 deletions

View File

@@ -9,7 +9,7 @@ sist2 (Simple incremental search tool)
## Features
* Fast, low memory usage
* Fast, low memory usage, multi-threaded
* Portable (all its features are packaged in a single executable)
* Extracts text from common file types\*
* Generates thumbnails\*
@@ -56,10 +56,10 @@ sist2 web --bind 0.0.0.0 --port 4321 ./my_idx1 ./my_idx2 ./my_idx3
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,cbz,cbr,fb2,epub | MuPDF | yes | yes, `png` | title |
`audio/*` | libav | - | yes, `jpeg` | ID3 tags |
`video/*` | libav | - | yes, `jpeg` | title, comment |
`image/*` | libav | - | yes, `jpeg` | *planned* |
pdf,xps,cbz,fb2,epub | MuPDF | yes | yes, `png` | title |
`audio/*` | ffmpeg | - | yes, `jpeg` | ID3 tags |
`video/*` | ffmpeg | - | yes, `jpeg` | title, comment, artist |
`image/*` | ffmpeg | - | yes, `jpeg` | `EXIF:Artist`, `EXIF:ImageDescription` |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | *(none)* | yes | no | - |
docx, xlsx, pptx | | *planned* | no | *planned* |
@@ -79,11 +79,12 @@ binaries.
apt install git cmake pkg-config libglib2.0-dev \
libssl-dev uuid-dev libavformat-dev libswscale-dev \
python3 libmagic-dev libfreetype6-dev libcurl-dev \
libbz2-dev yasm
libbz2-dev yasm libharfbuzz-dev ragel
```
*(FreeBSD)*
```bash
pkg install cmake gcc yasm gmake bash ffmpeg e2fsprogs-uuid
pkg install cmake gcc yasm gmake bash ffmpeg e2fsprogs-uuid \
autotools ragel
```
2. Build

View File

@@ -66,7 +66,7 @@ index_descriptor_t read_index_descriptor(char *path) {
strcpy(descriptor.root, cJSON_GetObjectItem(json, "root")->valuestring);
strcpy(descriptor.name, cJSON_GetObjectItem(json, "name")->valuestring);
strcpy(descriptor.rewrite_url, cJSON_GetObjectItem(json, "rewrite_url")->valuestring);
descriptor.root_len = (short)strlen(descriptor.root);
descriptor.root_len = (short) strlen(descriptor.root);
strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
strcpy(descriptor.uuid, cJSON_GetObjectItem(json, "uuid")->valuestring);
@@ -181,7 +181,7 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
uuid_unparse(line.uuid, uuid_str);
cJSON_AddStringToObject(document, "mime", mime_get_mime_text(line.mime));
cJSON_AddNumberToObject(document, "size", (double)line.size);
cJSON_AddNumberToObject(document, "size", (double) line.size);
cJSON_AddNumberToObject(document, "mtime", line.mtime);
int c;
@@ -208,14 +208,19 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
while (key != '\n') {
switch (key) {
case MetaWidth:
case MetaHeight:
case MetaMediaDuration:
case MetaMediaBitrate: {
case MetaHeight: {
int value;
fread(&value, sizeof(int), 1, file);
cJSON_AddNumberToObject(document, get_meta_key_text(key), value);
break;
}
case MetaMediaDuration:
case MetaMediaBitrate: {
long value;
fread(&value, sizeof(long), 1, file);
cJSON_AddNumberToObject(document, get_meta_key_text(key), value);
break;
}
case MetaMediaAudioCodec:
case MetaMediaVideoCodec: {
int value;
@@ -245,7 +250,7 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
break;
}
default:
fprintf(stderr, "Invalid meta key (corrupt index): %x", key);
fprintf(stderr, "Invalid meta key (corrupt index): %x\n", key);
break;
}

View File

@@ -10,7 +10,7 @@
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
static const char *const Version = "1.1.0";
static const char *const Version = "1.1.3";
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",

View File

@@ -142,6 +142,9 @@ void parse_font(const char *buf, size_t buf_len, document_t *doc) {
if (library == NULL) {
FT_Init_FreeType(&library);
}
if (buf == NULL) {
return;
}
FT_Face face;
FT_Error err = FT_New_Memory_Face(library, (unsigned char *) buf, buf_len, 0, &face);

View File

@@ -116,9 +116,9 @@ AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int st
return frame;
}
#define APPEND_TAG_META(doc, tag, keyname) \
#define APPEND_TAG_META(doc, tag_, keyname) \
text_buffer_t tex = text_buffer_create(-1); \
text_buffer_append_string0(&tex, tag->value); \
text_buffer_append_string0(&tex, tag_->value); \
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
meta_tag->key = keyname; \
strcpy(meta_tag->strval, tex.dyn_buffer.buf); \
@@ -151,30 +151,39 @@ void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
}
__always_inline
void append_video_meta(AVFormatContext *pFormatCtx, document_t *doc, int include_audio_tags) {
void append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int include_audio_tags, int is_video) {
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration;
meta_duration->longval = pFormatCtx->duration / AV_TIME_BASE;
APPEND_META(doc, meta_duration)
if (is_video) {
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration;
meta_duration->longval = pFormatCtx->duration / AV_TIME_BASE;
APPEND_META(doc, meta_duration)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
meta_bitrate->key = MetaMediaBitrate;
meta_bitrate->longval = pFormatCtx->bit_rate;
APPEND_META(doc, meta_bitrate)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
meta_bitrate->key = MetaMediaBitrate;
meta_bitrate->longval = pFormatCtx->bit_rate;
APPEND_META(doc, meta_bitrate)
}
AVDictionaryEntry *tag = NULL;
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[32];
strncpy(key, tag->key, sizeof(key));
char *ptr = key;
for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
if (strcmp(key, "title") == 0 && include_audio_tags) {
APPEND_TAG_META(doc, tag, MetaTitle)
} else if (strcmp(key, "comment") == 0) {
APPEND_TAG_META(doc, tag, MetaContent)
if (is_video) {
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
if (include_audio_tags && strcmp(tag->key, "title") == 0) {
APPEND_TAG_META(doc, tag, MetaTitle)
} else if (strcmp(tag->key, "comment") == 0) {
APPEND_TAG_META(doc, tag, MetaContent)
} else if (include_audio_tags && strcmp(tag->key, "artist") == 0) {
APPEND_TAG_META(doc, tag, MetaArtist)
}
}
} else {
// EXIF metadata
while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
if (include_audio_tags && strcmp(tag->key, "Artist") == 0) {
APPEND_TAG_META(doc, tag, MetaArtist)
} else if (strcmp(tag->key, "ImageDescription") == 0) {
APPEND_TAG_META(doc, tag, MetaContent)
}
}
}
}
@@ -236,11 +245,6 @@ void parse_media(const char *filepath, document_t *doc) {
if (video_stream != -1) {
AVStream *stream = pFormatCtx->streams[video_stream];
if (stream->nb_frames > 1) {
//This is a video (not a still image)
append_video_meta(pFormatCtx, doc, audio_stream == -1);
}
if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
@@ -273,6 +277,8 @@ void parse_media(const char *filepath, document_t *doc) {
return;
}
append_video_meta(pFormatCtx, frame, doc, audio_stream == -1, stream->nb_frames > 1);
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame, ScanCtx.tn_size);

View File

@@ -16,7 +16,6 @@ void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) {
if (*fd == -1) {
perror("open");
printf("%s\n", job->filepath);
free(job);
return NULL;
}
}
@@ -25,6 +24,7 @@ void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) {
int ret = read(*fd, full_buf + bytes_read, job->info.st_size - bytes_read);
if (ret == -1) {
perror("read");
return NULL;
}
}
@@ -108,7 +108,7 @@ void parse(void *arg) {
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
parse_pdf(pdf_buf, doc.size, &doc);
if (pdf_buf != buf) {
if (pdf_buf != buf && pdf_buf != NULL) {
free(pdf_buf);
}
@@ -119,7 +119,7 @@ void parse(void *arg) {
void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
parse_font(font_buf, doc.size, &doc);
if (font_buf != buf) {
if (font_buf != buf && font_buf != NULL) {
free(font_buf);
}
}

View File

@@ -114,6 +114,10 @@ int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
if (buf == NULL) {
return;
}
static int mu_is_initialized = 0;
if (!mu_is_initialized) {
pthread_mutex_init(&ScanCtx.mupdf_mu, NULL);

View File

@@ -90,7 +90,7 @@ void text_buffer_terminate_string(text_buffer_t *buf) {
}
__always_inline
int utf8_validchr(const char* s) {
int utf8_validchr(const char *s) {
if (0x00 == (0x80 & *s)) {
return TRUE;
} else if (0xf0 == (0xf8 & *s)) {
@@ -130,7 +130,7 @@ int utf8_validchr(const char* s) {
if (0 == (0x1e & s[0])) {
return FALSE;
}
} else {
} else {
return FALSE;
}
@@ -140,12 +140,22 @@ int utf8_validchr(const char* s) {
int text_buffer_append_string(text_buffer_t *buf, char *str, size_t len) {
utf8_int32_t c;
for (void *v = utf8codepoint(str, &c); c != '\0' && ((char*)v - str + 4) < len; v = utf8codepoint(v, &c)) {
if (str == NULL || len < 1 ||
(0xf0 == (0xf8 & str[0]) && len < 4) ||
(0xe0 == (0xf0 & str[0]) && len < 3) ||
(0xc0 == (0xe0 & str[0]) && len == 1) ||
*(str) == 0) {
text_buffer_terminate_string(buf);
return 0;
}
for (void *v = utf8codepoint(str, &c); c != '\0' && ((char *) v - str + 4) < len; v = utf8codepoint(v, &c)) {
if (utf8_validchr(v)) {
text_buffer_append_char(buf, c);
}
}
text_buffer_terminate_string(buf);
return 0;
}
int text_buffer_append_string0(text_buffer_t *buf, char *str) {

File diff suppressed because one or more lines are too long

View File

@@ -136,6 +136,9 @@ function createDocCard(hit) {
thumbnail = document.createElement("img");
thumbnail.setAttribute("class", "card-img-top fit");
thumbnail.setAttribute("src", `t/${hit["_source"]["index"]}/${hit["_id"]}`);
thumbnail.addEventListener("error", () => {
imgWrapper.remove();
});
}
//Thumbnail overlay

View File

@@ -32,7 +32,7 @@ window.onload = () => {
})
};
function toggleSearchBar() {
function toggleFuzzy() {
searchDebounced();
}
@@ -218,11 +218,22 @@ function search() {
let query = searchBar.value;
let empty = query === "";
let condition = $("#barToggle").prop("checked") && !empty ? "must" : "should";
let condition = empty ? "should" : "must";
let filters = [
{range: {size: {gte: size_min, lte: size_max}}},
{terms: {index: selectedIndices}}
];
let fields = [
"name^8",
"content^3",
"album^8", "artist^8", "title^8", "genre^2", "album_artist^8",
"font_name^6"
];
if ($("#fuzzyToggle").prop("checked")) {
fields.push("content.nGram");
fields.push("name.nGram^3");
}
let path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
if (path !== "") {
@@ -243,12 +254,7 @@ function search() {
multi_match: {
query: query,
type: "most_fields",
fields: [
"name^8", "name.nGram^3", "content^3",
"content.nGram",
"album^8", "artist^8", "title^8", "genre^2", "album_artist^8",
"font_name^6"
],
fields: fields,
operator: "and"
}
},
@@ -265,7 +271,7 @@ function search() {
content: {},
name: {},
"name.nGram": {},
// font_name: {},
font_name: {},
}
},
aggs: {

View File

@@ -24,9 +24,9 @@
<div class="input-group">
<div class="input-group-prepend">
<div class="input-group-text">
<span onclick="document.getElementById('barToggle').click()">Must match&nbsp</span>
<input title="Toggle between 'Should' and 'Must' match mode" type="checkbox" id="barToggle"
onclick="toggleSearchBar()" checked>
<span onclick="document.getElementById('fuzzyToggle').click()">Fuzzy&nbsp</span>
<input title="Toggle fuzzy searching" type="checkbox" id="fuzzyToggle"
onclick="toggleFuzzy()" checked>
</div>
</div>
<input id="searchBar" type="search" class="form-control" placeholder="Search">