Compare commits

...

16 Commits

Author SHA1 Message Date
046edea0e2 Handle special characters in file paths 2020-06-10 19:45:36 -04:00
a011b7e97b Fragment size setting 2020-06-09 21:40:53 -04:00
8c1c1697e0 Fix file wordexp in some paths #59 2020-06-05 19:41:02 -04:00
018b49fa4c Fix csv_escape #58 2020-06-05 19:13:03 -04:00
27b4e6403e Re-enable path autocomplete #54 2020-06-02 19:46:58 -04:00
13fdbd9e69 Fix for ES 7.7 #54 2020-06-01 18:14:34 -04:00
5e7fdaf8dd Update issue-template.md 2020-06-01 10:45:43 -04:00
19d5c8ac9f Update issue-template.md 2020-05-29 18:19:21 -04:00
99497049a8 Merge pull request #53 from dpieski/patch-1
Update README
2020-05-29 18:16:13 -04:00
Andrew
1a3181d78b Update README
changed case of path in a link to the usage guide to fix 404 error.
2020-05-29 15:37:20 -05:00
449aa77c8f Fix for unknown mime inside archives 2020-05-25 17:36:04 -04:00
3058c55510 Memory leak fix #37 2020-05-24 15:42:42 -04:00
dedf9287b2 Fix name separation in --archive list mode 2020-05-24 14:36:59 -04:00
ab199b0c0c Remove arc_reset() function because seek() inside archive doesn't work 2020-05-24 14:18:31 -04:00
c4fbae123e Better support for media files inside archives 2020-05-24 14:10:23 -04:00
dd2397ef5c handle .tgz #44, ignore files inside archives for stats page 2020-05-24 10:10:28 -04:00
25 changed files with 213 additions and 64 deletions

View File

@@ -9,7 +9,9 @@ assignees: ''
sist2 version:
Platform (please indicate if you're using Docker):
Platform (Linux or Docker):
Elasticsearch version:
Command with arguments: `ex: "scan ~/Documents -o ./i2 --threads 3 -q 1.0"`

View File

@@ -53,7 +53,7 @@ sist2 (Simple incremental search tool)
1. *(or)* Download a [development snapshot](https://files.simon987.net/artifacts/Sist2/Build/) *(Not recommended!)*
1. *(or)* `docker pull simon987/sist2:latest`
1. See [Usage guide](DOCS/USAGE.md)
1. See [Usage guide](docs/USAGE.md)
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
@@ -61,7 +61,7 @@ sist2 (Simple incremental search tool)
## Example usage
See [Usage guide](DOCS/USAGE.md) for more details
See [Usage guide](docs/USAGE.md) for more details
1. Scan a directory: `sist2 scan ~/Documents -o ./docs_idx`
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
@@ -91,14 +91,12 @@ they were directly in the file system. Recursive (archives inside archives)
scan is also supported.
**Limitations**:
* Parsing media files with formats that require
*seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) is not supported.
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
is limited (see `--mem-buffer` option)
* Archive files are scanned sequentially, by a single thread. On systems where
**sist2** is not I/O bound, scans might be faster when larger archives are split
into smaller parts.
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
### OCR

View File

@@ -40,6 +40,9 @@ Scan options
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size in MB for files inside archives (see USAGE.md). DEFAULT: 2000
Index options
--es-url=<str> Elasticsearch url with port. DEFAULT=http://localhost:9200
@@ -102,6 +105,11 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
In effect, smaller `treemap-threshold` values will yield a more detailed
(but also a more cluttered and harder to read) visualization.
* `--mem-buffer` Maximum memory buffer size in MB (per thread) for files inside archives. Media files
larger than this number will be read sequentially and no *seek* operations will be supported.
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
### Scan examples
Simple scan

View File

@@ -10,6 +10,7 @@
"path": {
"type": "text",
"analyzer": "path_analyzer",
"copy_to": "suggest-path",
"fielddata": true,
"fields": {
"nGram": {
@@ -22,6 +23,10 @@
}
}
},
"suggest-path": {
"type": "completion",
"analyzer": "case_insensitive_kw_analyzer"
},
"mime": {
"type": "keyword"
},

View File

@@ -13,7 +13,7 @@ application/epub+zip, epub
application/freeloader, frl
application/futuresplash, spl
application/groupwise, vew
application/gzip, gz
application/gzip, gz|tgz
application/hta, hta
application/i-deas, unv
application/iges, iges|igs
@@ -429,4 +429,4 @@ video/x-qtc, qtc
video/x-sgi-movie, movie|mv
x-epoc/x-sisx-app,
application/x-zstd-dictionary,
application/vnd.ms-outlook,
application/vnd.ms-outlook, msg
1 application/arj arj
13 application/freeloader frl
14 application/futuresplash spl
15 application/groupwise vew
16 application/gzip gz gz|tgz
17 application/hta hta
18 application/i-deas unv
19 application/iges iges|igs
429 video/x-sgi-movie movie|mv
430 x-epoc/x-sisx-app
431 application/x-zstd-dictionary
432 application/vnd.ms-outlook msg

View File

@@ -14,6 +14,8 @@
#define DEFAULT_LISTEN_ADDRESS "localhost:4090"
#define DEFAULT_TREEMAP_THRESHOLD 0.0005
#define DEFAULT_MAX_MEM_BUFFER 2000
const char* TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
@@ -187,6 +189,10 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->treemap_threshold = atof(args->treemap_threshold_str);
}
if (args->max_memory_buffer == 0) {
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@@ -203,6 +209,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast)
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
return 0;
}

View File

@@ -24,6 +24,7 @@ typedef struct scan_args {
int fast;
const char* treemap_threshold_str;
double treemap_threshold;
int max_memory_buffer;
} scan_args_t;
scan_args_t *scan_args_create();

View File

@@ -202,9 +202,8 @@ void delete_queue(int max) {
Indexer->line_head = tmp->next;
if (Indexer->line_head == NULL) {
Indexer->line_tail = NULL;
} else {
free(tmp);
}
free(tmp);
Indexer->queued -= 1;
}
}

File diff suppressed because one or more lines are too long

View File

@@ -22,7 +22,7 @@
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
static const char *const Version = "2.3.0";
static const char *const Version = "2.4.2";
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",
@@ -127,6 +127,7 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.media_ctx.log = _log;
ScanCtx.media_ctx.logf = _logf;
ScanCtx.media_ctx.store = _store;
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
init_media();
// OOXML
@@ -357,7 +358,10 @@ int main(int argc, const char *argv[]) {
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
"(see USAGE.md). DEFAULT: 0.0005"),
"(see USAGE.md). DEFAULT: 0.0005"),
OPT_INTEGER(0, "mem-buffer", &scan_args->max_memory_buffer,
"Maximum memory buffer size per thread in MB for files inside archives "
"(see USAGE.md). DEFAULT: 2000"),
OPT_GROUP("Index options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),

View File

@@ -885,6 +885,7 @@ g_hash_table_insert(ext_table, "frl", (gpointer)application_freeloader);
g_hash_table_insert(ext_table, "spl", (gpointer)application_futuresplash);
g_hash_table_insert(ext_table, "vew", (gpointer)application_groupwise);
g_hash_table_insert(ext_table, "gz", (gpointer)application_gzip);
g_hash_table_insert(ext_table, "tgz", (gpointer)application_gzip);
g_hash_table_insert(ext_table, "hta", (gpointer)application_hta);
g_hash_table_insert(ext_table, "unv", (gpointer)application_i_deas);
g_hash_table_insert(ext_table, "iges", (gpointer)application_iges);
@@ -1387,6 +1388,7 @@ g_hash_table_insert(ext_table, "divx", (gpointer)video_x_msvideo);
g_hash_table_insert(ext_table, "qtc", (gpointer)video_x_qtc);
g_hash_table_insert(ext_table, "movie", (gpointer)video_x_sgi_movie);
g_hash_table_insert(ext_table, "mv", (gpointer)video_x_sgi_movie);
g_hash_table_insert(ext_table, "msg", (gpointer)application_vnd_ms_outlook);
return ext_table;}
GHashTable *mime_get_mime_table() {GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(mime_table, "application/arj", (gpointer)application_arj);

View File

@@ -78,6 +78,11 @@ void parse(void *arg) {
if (doc.mime == 0 && !ScanCtx.fast) {
// Get mime type with libmagic
if (!job->vfile.is_fs_file) {
LOG_WARNING(job->filepath, "Guessing mime type with libmagic inside archive files is not currently supported");
goto abort;
}
bytes_read = job->vfile.read(&job->vfile, buf, MAGIC_BUF_SIZE);
if (bytes_read < 0) {
@@ -147,14 +152,13 @@ void parse(void *arg) {
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, &doc);
}
abort:
//Parent meta
if (!uuid_is_null(job->parent)) {
char tmp[UUID_STR_LEN];
uuid_unparse(job->parent, tmp);
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + UUID_STR_LEN + 1);
meta_parent->key = MetaParent;
strcpy(meta_parent->str_val, tmp);
uuid_unparse(job->parent, meta_parent->str_val);
APPEND_META((&doc), meta_parent)
}

4
src/static/css/autocomplete.min.css vendored Normal file
View File

@@ -0,0 +1,4 @@
.autocomplete-suggestions { text-align: left; cursor: default; border: 1px solid #ccc; border-top: 0; background: #fff; box-shadow: -1px 1px 3px rgba(0,0,0,.1); position: absolute; display: none; z-index: 9999; max-height: 254px; overflow: hidden; overflow-y: auto; box-sizing: border-box; }
.autocomplete-suggestion { position: relative; padding: 0 .6em; line-height: 23px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; font-size: 1.02em; color: #333; }
.autocomplete-suggestion b { font-weight: normal; color: #1f8dd6; }
.autocomplete-suggestion.selected { background: #f0f0f0; }

View File

@@ -266,6 +266,7 @@ mark {
margin: 3px;
white-space: normal;
color: rgb(224, 224, 224);
overflow: hidden;
}
.irs-single, .irs-from, .irs-to {

View File

@@ -205,6 +205,7 @@ mark {
margin: 3px;
white-space: normal;
color: #000;
overflow: hidden;
}
.irs-single, .irs-from, .irs-to {

3
src/static/js/auto-complete.min.js vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -27,18 +27,12 @@ function gifOver(thumbnail, hit) {
}
function getContentHighlight(hit) {
const re = RegExp(/<mark>/g);
const sortByMathCount = (a, b) => {
return b.match(re).length - a.match(re).length;
};
if (hit.hasOwnProperty("highlight")) {
if (hit["highlight"].hasOwnProperty("content")) {
return hit["highlight"]["content"].sort(sortByMathCount)[0];
return hit["highlight"]["content"][0];
} else if (hit["highlight"].hasOwnProperty("content.nGram")) {
return hit["highlight"]["content.nGram"].sort(sortByMathCount)[0];
return hit["highlight"]["content.nGram"][0];
}
}
@@ -77,6 +71,7 @@ function shouldPlayVideo(hit) {
return mime &&
mime.startsWith("video/") &&
!("parent" in hit["_source"]) &&
hit["_source"]["extension"] !== "mkv" &&
hit["_source"]["extension"] !== "avi" &&
videoc !== "hevc" &&

View File

@@ -74,6 +74,41 @@ function showEsError() {
window.onload = () => {
CONF.load();
new autoComplete({
selector: '#pathBar',
minChars: 1,
delay: 400,
renderItem: function (item) {
return '<div class="autocomplete-suggestion" data-val="' + item + '">' + item + '</div>';
},
source: async function (term, suggest) {
if (!CONF.options.suggestPath) {
return []
}
term = term.toLowerCase();
const choices = await getPathChoices();
let matches = [];
for (let i = 0; i < choices.length; i++) {
if (~choices[i].toLowerCase().indexOf(term)) {
matches.push(choices[i]);
}
}
suggest(matches.sort());
},
onSelect: function () {
searchDebounced();
}
});
searchBar.addEventListener("keyup", searchDebounced);
pathBar.addEventListener("keyup", e => {
if (e.key === "Enter") {
searchDebounced();
}
});
};
function toggleFuzzy() {
@@ -105,10 +140,7 @@ $.jsonPost("i").then(resp => {
});
function getDocumentInfo(id) {
return $.getJSON("d/" + id).fail(e => {
console.log(e);
showEsError();
})
return $.getJSON("d/" + id).fail(showEsError)
}
function handleTreeClick(tree) {
@@ -332,24 +364,24 @@ function search(after = null) {
let path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
if (path !== "") {
filters.push([{term: {path: path}}])
filters.push({term: {path: path}})
}
let mimeTypes = getSelectedNodes(mimeTree);
if (!mimeTypes.includes("any")) {
filters.push([{terms: {"mime": mimeTypes}}]);
filters.push({terms: {"mime": mimeTypes}});
}
let tags = getSelectedNodes(tagTree);
if (!tags.includes("any")) {
filters.push([{terms: {"tag": tags}}]);
filters.push({terms: {"tag": tags}});
}
if (date_min && date_max) {
filters.push([{range: {mtime: {gte: date_min, lte: date_max}}}])
filters.push({range: {mtime: {gte: date_min, lte: date_max}}})
} else if (date_min) {
filters.push([{range: {mtime: {gte: date_min}}}])
filters.push({range: {mtime: {gte: date_min}}})
} else if (date_max) {
filters.push([{range: {mtime: {lte: date_max}}}])
filters.push({range: {mtime: {lte: date_max}}})
}
let q = {
@@ -385,6 +417,9 @@ function search(after = null) {
q.highlight = {
pre_tags: ["<mark>"],
post_tags: ["</mark>"],
fragment_size: CONF.options.fragmentSize,
number_of_fragments: 1,
order: "score",
fields: {
content: {},
// "content.nGram": {},
@@ -441,8 +476,6 @@ let searchDebounced = _.debounce(function () {
search()
}, 500);
searchBar.addEventListener("keyup", searchDebounced);
pathBar.addEventListener("keyup", searchDebounced);
//Size slider
$("#sizeSlider").ionRangeSlider({
@@ -607,7 +640,8 @@ function createPathTree(target) {
let pathTree = new InspireTree({
data: function (node, resolve, reject) {
return getNextDepth(node);
}
},
sort: "text"
});
selectedIndices.forEach(index => {
@@ -627,3 +661,19 @@ function createPathTree(target) {
pathTree.on("node.click", handlePathTreeClick(pathTree));
}
/**
 * Fetch path auto-complete candidates from the "es" endpoint.
 *
 * Posts a completion-suggester query on the "suggest-path" field, using the
 * current value of the path bar as the prefix.
 *
 * @returns {Promise<string[]>} Resolves with the `_source.path` value of each
 *     option in the first "path" suggestion entry. Rejects when the request
 *     fails or when the response does not have the expected
 *     `suggest.path[0].options` shape, so an awaiting caller is never left
 *     waiting on a promise that cannot settle.
 */
function getPathChoices() {
    const query = {
        suggest: {
            path: {
                prefix: pathBar.value,
                completion: {
                    field: "suggest-path",
                    skip_duplicates: true,
                    size: 10000
                }
            }
        }
    };

    return new Promise((resolve, reject) => {
        // NOTE(review): assumes $.jsonPost returns a thenable that accepts a
        // rejection handler as the second argument of then() (the case for
        // jQuery promises and native Promises) - confirm against its definition.
        $.jsonPost("es", query).then(resp => {
            try {
                const options = resp["suggest"]["path"][0]["options"];
                resolve(options.map(opt => opt["_source"]["path"]));
            } catch (err) {
                // Malformed/unexpected response: surface it instead of hanging.
                reject(err);
            }
        }, reject);
    });
}

View File

@@ -100,6 +100,8 @@ const _defaults = {
treemapGroupingDepth: 3,
treemapColor: "PuBuGn",
treemapSize: "large",
suggestPath: true,
fragmentSize: 100
};
function loadSettings() {
@@ -114,6 +116,8 @@ function loadSettings() {
$("#settingTreemapColor").val(CONF.options.treemapColor);
$("#settingTreemapSize").val(CONF.options.treemapSize);
$("#settingTreemapType").val(CONF.options.treemapType);
$("#settingSuggestPath").prop("checked", CONF.options.suggestPath);
$("#settingFragmentSize").val(CONF.options.fragmentSize);
}
function Settings() {
@@ -155,6 +159,8 @@ function updateSettings() {
CONF.options.treemapColor = $("#settingTreemapColor").val();
CONF.options.treemapSize = $("#settingTreemapSize").val();
CONF.options.treemapType = $("#settingTreemapType").val();
CONF.options.suggestPath = $("#settingSuggestPath").prop("checked");
CONF.options.fragmentSize = $("#settingFragmentSize").val();
CONF.save();
if (typeof searchDebounced !== "undefined") {

View File

@@ -11,7 +11,7 @@
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="/">sist2</a>
<span class="badge badge-pill version">2.3.0</span>
<span class="badge badge-pill version">2.4.2</span>
<span class="tagline">Lightning-fast file system indexer and search tool </span>
<a class="btn ml-auto" href="/stats">Stats</a>
<button class="btn" type="button" data-toggle="modal" data-target="#settings" onclick="loadSettings()">Settings</button>
@@ -192,6 +192,17 @@
<label class="custom-control-label" for="settingSearchInPath">Enable matching query against document path</label>
</div>
<div class="custom-control custom-checkbox">
<input type="checkbox" class="custom-control-input" id="settingSuggestPath">
<label class="custom-control-label" for="settingSuggestPath">Enable auto-complete in path filter bar</label>
</div>
<br/>
<div class="form-group">
<input type="number" class="form-control" id="settingFragmentSize">
<label for="settingFragmentSize">Highlight context size in characters</label>
</div>
<label for="settingDisplay">Display</label>
<select id="settingDisplay" class="form-control form-control-sm">
<option value="grid">Grid</option>

View File

@@ -10,7 +10,7 @@
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="/">sist2</a>
<span class="badge badge-pill version">2.3.0</span>
<span class="badge badge-pill version">2.4.2</span>
<span class="tagline">Lightning-fast file system indexer and search tool </span>
<a style="margin-left: auto" class="btn" href="/">Back</a>
<button class="btn" type="button" data-toggle="modal" data-target="#settings"
@@ -77,6 +77,17 @@
path</label>
</div>
<div class="custom-control custom-checkbox">
<input type="checkbox" class="custom-control-input" id="settingSuggestPath">
<label class="custom-control-label" for="settingSuggestPath">Enable auto-complete in path filter bar</label>
</div>
<br/>
<div class="form-group">
<input type="number" class="form-control" id="settingFragmentSize">
<label for="settingFragmentSize">Highlight context size in characters</label>
</div>
<label for="settingDisplay">Display</label>
<select id="settingDisplay" class="form-control form-control-sm">
<option value="grid">Grid</option>

View File

@@ -24,6 +24,10 @@ typedef struct {
void fill_tables(cJSON *document, UNUSED(const char uuid_str[UUID_STR_LEN])) {
if (cJSON_GetObjectItem(document, "parent") != NULL) {
return;
}
const char *json_path = cJSON_GetObjectItem(document, "path")->valuestring;
char *path = malloc(strlen(json_path) + 1);
strcpy(path, json_path);
@@ -167,7 +171,7 @@ int merge_up(double thresh) {
int size = g_hash_table_size(FlatTree);
LOG_DEBUGF("stats.h", "Merge up iteration (%d merged, %d in tree)", count, size)
LOG_DEBUGF("stats.c", "Merge up iteration (%d merged, %d in tree)", count, size)
return count;
}
@@ -184,9 +188,9 @@ void csv_escape(char *dst, const char *str) {
return;
}
while (*ptr++ != 0) {
char c = *ptr;
*out++ = '"';
char c;
while ((c = *ptr++) != 0) {
if (c == '"') {
*out++ = '"';
*out++ = '"';
@@ -194,6 +198,8 @@ void csv_escape(char *dst, const char *str) {
*out++ = c;
}
}
*out++ = '"';
*out = '\0';
}
int open_or_exit(const char *path) {

View File

@@ -26,10 +26,11 @@ dyn_buffer_t url_escape(char *str) {
}
char *abspath(const char *path) {
wordexp_t w;
wordexp(path, &w, 0);
char *abs = realpath(w.we_wordv[0], NULL);
char *expanded = expandpath(path);
char *abs = realpath(expanded, NULL);
free(expanded);
if (abs == NULL) {
return NULL;
}
@@ -38,16 +39,46 @@ char *abspath(const char *path) {
strcat(abs, "/");
}
wordfree(&w);
return abs;
}
char *expandpath(const char *path) {
wordexp_t w;
wordexp(path, &w, 0);
void shell_escape(char *dst, const char *src) {
const char *ptr = src;
char *out = dst;
while ((*ptr)) {
char c = *ptr++;
char *expanded = malloc(strlen(w.we_wordv[0]) + 2);
strcpy(expanded, w.we_wordv[0]);
if (c == '&' || c == '\n' || c == '|' || c == ';' || c == '<' ||
c == '>' || c == '(' || c == ')' || c == '{' || c == '}') {
*out++ = '\\';
}
*out++ = c;
}
*out = 0;
}
char *expandpath(const char *path) {
char tmp[PATH_MAX * 2];
shell_escape(tmp, path);
wordexp_t w;
wordexp(tmp, &w, 0);
if (w.we_wordv == NULL) {
return NULL;
}
*tmp = '\0';
for (int i = 0; i < w.we_wordc; i++) {
strcat(tmp, w.we_wordv[i]);
if (i != w.we_wordc - 1) {
strcat(tmp, " ");
}
}
char *expanded = malloc(strlen(tmp) + 2);
strcpy(expanded, tmp);
strcat(expanded, "/");
wordfree(&w);
@@ -152,7 +183,7 @@ void str_escape(char *dst, const char *str) {
break;
}
cur += sprintf(cur, "%c%02X", ESCAPE_CHAR, (unsigned char)tmp[i]);
cur += sprintf(cur, "%c%02X", ESCAPE_CHAR, (unsigned char) tmp[i]);
}
continue;
}
@@ -198,12 +229,12 @@ void str_unescape(char *dst, const char *str) {
char next = *ptr;
if (next == ESCAPE_CHAR) {
*cur++ = (char)c;
*cur++ = (char) c;
ptr += 1;
} else {
tmp[0] = *(ptr);
tmp[1] = *(ptr + 1);
*cur++ = (char)strtol(tmp, NULL, 16);
*cur++ = (char) strtol(tmp, NULL, 16);
ptr += 2;
}
} else {

File diff suppressed because one or more lines are too long