Compare commits

...

3 Commits

Author SHA1 Message Date
7c46ad632a Update readme 2021-06-11 20:44:47 -04:00
5b8c13fd13 Handle GPS metadata in the UI 2021-06-11 20:41:05 -04:00
efa4a06e56 Fix meta_key UB problem 2021-06-11 20:19:36 -04:00
9 changed files with 99 additions and 44 deletions

View File

@ -25,14 +25,12 @@ sist2 (Simple incremental search tool)
* OCR support with tesseract \*\*\*
* Stats page & disk utilisation visualization
\* See [format support](#format-support)
\*\* See [Archive files](#archive-files)
\*\*\* See [OCR](#ocr)
\*\*\* See [OCR](#ocr)
![stats](docs/stats.png)
## Getting Started
1. Have an Elasticsearch (>= 6.X.X) instance running
@ -56,10 +54,8 @@ sist2 (Simple incremental search tool)
1. *(or)* `docker pull simon987/sist2:latest`
1. See [Usage guide](docs/USAGE.md)
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
## Example usage
@ -69,7 +65,6 @@ See [Usage guide](docs/USAGE.md) for more details
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
1. Start web interface: `sist2 web ./docs_idx`
## Format support
File type | Library | Content | Thumbnail | Metadata
@ -78,8 +73,8 @@ pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | *(none)* | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190) |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | *(none)* | yes | no | - |
html, xml | *(none)* | yes | no | - |
@ -89,40 +84,39 @@ doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
\* *See [Archive files](#archive-files)*
### Archive files
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if
they were directly in the file system. Recursive (archives inside archives)
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if they were directly in the file system.
Recursive (archives inside archives)
scan is also supported.
**Limitations**:
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
is limited (see `--mem-buffer` option)
* Archive files are scanned sequentially, by a single thread. On systems where
**sist2** is not I/O bound, scans might be faster when larger archives are split
into smaller parts.
**sist2** is not I/O bound, scans might be faster when larger archives are split into smaller parts.
### OCR
You can enable OCR support for pdf,xps,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your
package manager (`apt install tesseract-ocr-eng`) or directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages
The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed.
Examples
```bash
sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/
```
## Build from source
You can compile **sist2** by yourself if you don't want to use the pre-compiled
binaries (GCC 7+ required).
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries (GCC 7+ required).
1. Install compile-time dependencies

View File

@ -105,10 +105,10 @@
"analyzer": "my_nGram",
"type": "text"
},
"_keyword.*": {
"_keyword.*": {
"type": "keyword"
},
"_text.*": {
"_text.*": {
"analyzer": "content_analyzer",
"type": "text",
"fields": {
@ -165,6 +165,30 @@
"exif_user_comment": {
"type": "text"
},
"exif_gps_longitude_ref": {
"type": "keyword",
"index": false
},
"exif_gps_longitude_dms": {
"type": "keyword",
"index": false
},
"exif_gps_longitude_dec": {
"type": "keyword",
"index": false
},
"exif_gps_latitude_ref": {
"type": "keyword",
"index": false
},
"exif_gps_latitude_dms": {
"type": "keyword",
"index": false
},
"exif_gps_latitude_dec": {
"type": "keyword",
"index": false
},
"author": {
"type": "text"
},

File diff suppressed because one or more lines are too long

View File

@ -18,7 +18,7 @@ typedef struct {
#define META_NEXT 0xFFFF
void skip_meta(FILE *file) {
enum metakey key;
enum metakey key = 0;
fread(&key, sizeof(uint16_t), 1, file);
while (key != META_NEXT) {
@ -237,7 +237,7 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
FILE *file = fopen(path, "rb");
while (TRUE) {
buf.cur = 0;
size_t _ = fread((void *) &line, 1, sizeof(line_t), file);
size_t _ = fread((void *) &line, sizeof(line_t), 1, file);
if (feof(file)) {
break;
}
@ -284,8 +284,8 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
cJSON_AddStringToObject(document, "path", "");
}
enum metakey key;
fread(&key, sizeof(short), 1, file);
enum metakey key = 0;
fread(&key, sizeof(uint16_t), 1, file);
size_t ret;
while (key != META_NEXT) {
switch (key) {
@ -481,7 +481,7 @@ void incremental_read(GHashTable *table, const char *filepath) {
incremental_put(table, line.path_md5, line.mtime);
while ((getc(file))) {}
while ((getc(file)) != 0) {}
skip_meta(file);
}
fclose(file);
@ -531,7 +531,7 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
free(buf);
}
enum metakey key;
enum metakey key = 0;
while (1) {
fread(&key, sizeof(uint16_t), 1, file);
fwrite(&key, sizeof(uint16_t), 1, dst_file);

View File

@ -192,6 +192,19 @@ function makeUserTag(tag, hit) {
return userTag;
}
/**
 * Append an "Exif GPS" row to a metadata table body. The row's value cell
 * holds a link labelled "<latitude>, <longitude>" that opens a Google Maps
 * query for that position in a new tab.
 *
 * @param tbody     Object exposing a jQuery-style `append()`; presumably the
 *                  jQuery-wrapped <tbody> of the info table (confirm at caller).
 * @param latitude  Latitude value shown in the label and placed in the URL.
 * @param longitude Longitude value shown in the label and placed in the URL.
 */
function makeGpsMetaRow(tbody, latitude, longitude) {
    // The coordinates originate from file metadata, so escape them before
    // placing them in the query string; otherwise a crafted value could
    // inject extra URL parameters.
    const lat = encodeURIComponent(latitude);
    const lon = encodeURIComponent(longitude);
    const mapUrl = `https://maps.google.com/?q=${lat},${lon}&ll=${lat},${lon}&t=k&z=17`;

    const link = $("<a>")
        .text(`${latitude}, ${longitude}`)
        .attr("href", mapUrl)
        .attr("target", "_blank")
        // Prevent the newly opened tab from reaching back through window.opener.
        .attr("rel", "noopener noreferrer");

    tbody.append($("<tr>")
        .append($("<td>").text("Exif GPS"))
        .append($("<td>").append(link))
    );
}
function infoButtonCb(hit) {
return () => {
getDocumentInfo(hit["_id"]).then(doc => {
@ -229,13 +242,25 @@ function infoButtonCb(hit) {
.text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " "))
.attr("title", doc["mtime"]))
);
// Exif GPS
if ("exif_gps_longitude_dec" in doc) {
makeGpsMetaRow(tbody, doc["exif_gps_latitude_dec"], doc["exif_gps_longitude_dec"])
} else if ("exif_gps_longitude_dms" in doc) {
makeGpsMetaRow(
tbody,
dmsToDecimal(doc["exif_gps_latitude_dms"], doc["exif_gps_latitude_ref"]),
dmsToDecimal(doc["exif_gps_longitude_dms"], doc["exif_gps_longitude_ref"]),
)
}
const displayFields = new Set([
"mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc",
"bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author",
"modified_by", "pages"
]);
Object.keys(doc)
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || key.startsWith("exif_"))
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || (key.startsWith("exif_") && !key.includes("gps")))
.forEach(key => {
tbody.append($("<tr>")
.append($("<td>").text(key))
@ -352,7 +377,7 @@ function createDocCard(hit) {
audio.setAttribute("src", "f/" + hit["_id"]);
audio.addEventListener("play", () => {
// Pause all currently playing audio tags
$("audio").each(function(){
$("audio").each(function () {
if (this !== audio) {
this.pause();
}

View File

@ -235,4 +235,14 @@ function updateColumnStyle() {
}
`
}
}
/**
 * Convert a degrees/minutes/seconds coordinate to signed decimal degrees.
 *
 * @param {string} dms Comma-separated components, each written as
 *                     "numerator:denominator" (e.g. "45:1, 30:1, 1234:100").
 *                     A component without ":" is read as a plain number, and
 *                     missing minutes/seconds components count as 0.
 * @param {string} ref Hemisphere reference; "S" or "W" makes the result
 *                     negative, anything else leaves it positive.
 * @returns {number} Decimal degrees, or NaN when a component is not numeric.
 */
function dmsToDecimal(dms, ref) {
    // Parse one "numerator:denominator" component. The denominator defaults
    // to 1 so that a bare number such as "45" no longer evaluates to NaN.
    const parseRational = (token) => {
        const text = token.trim()
        if (text === "") {
            // Number("") is 0 in JavaScript; report an empty component as
            // invalid instead of silently treating it as zero.
            return NaN
        }
        const [numerator, denominator = "1"] = text.split(":")
        return Number(numerator) / Number(denominator)
    }

    // Absent trailing components default to 0 instead of throwing a TypeError.
    const [d = 0, m = 0, s = 0] = String(dms).split(",").map(parseRational)
    const sign = (ref === "S" || ref === "W") ? -1 : 1

    return (d + (m / 60) + (s / 3600)) * sign
}

File diff suppressed because one or more lines are too long

View File

@ -17,17 +17,19 @@ def copy_files(files):
def sist2(*args):
print("./sist2 " + " ".join(args))
return subprocess.check_output(
args=["./sist2_debug", *args],
args=["./sist2", *args],
)
def sist2_index(files, *args):
path = copy_files(files)
shutil.rmtree("i", ignore_errors=True)
sist2("scan", path, "-o", "i", *args)
return iter(sist2_index_to_dict("i"))
shutil.rmtree("test_i", ignore_errors=True)
sist2("scan", path, "-o", "test_i", *args)
return iter(sist2_index_to_dict("test_i"))
def sist2_incremental_index(files, func=None, *args):
@ -36,14 +38,14 @@ def sist2_incremental_index(files, func=None, *args):
if func:
func(path)
shutil.rmtree("i_inc", ignore_errors=True)
sist2("scan", path, "-o", "i_inc", "--incremental", "i", *args)
return iter(sist2_index_to_dict("i_inc"))
shutil.rmtree("test_i_inc", ignore_errors=True)
sist2("scan", path, "-o", "test_i_inc", "--incremental", "test_i", *args)
return iter(sist2_index_to_dict("test_i_inc"))
def sist2_index_to_dict(index):
res = subprocess.check_output(
args=["./sist2_debug", "index", "--print", index],
args=["./sist2", "index", "--print", index],
)
for line in res.splitlines():

2
third-party/libscan vendored

@ -1 +1 @@
Subproject commit 598e748214fe0656d536e40bb9e056c058504d85
Subproject commit 9be4f02851107edac65894a1fdde16a80cad43ac