Compare commits

..

3 Commits

Author SHA1 Message Date
7c46ad632a Update readme 2021-06-11 20:44:47 -04:00
5b8c13fd13 Handle GPS metadata in the UI 2021-06-11 20:41:05 -04:00
efa4a06e56 Fix meta_key UB problem 2021-06-11 20:19:36 -04:00
9 changed files with 99 additions and 44 deletions

View File

@ -25,14 +25,12 @@ sist2 (Simple incremental search tool)
* OCR support with tesseract \*\*\* * OCR support with tesseract \*\*\*
* Stats page & disk utilisation visualization * Stats page & disk utilisation visualization
\* See [format support](#format-support) \* See [format support](#format-support)
\*\* See [Archive files](#archive-files) \*\* See [Archive files](#archive-files)
\*\*\* See [OCR](#ocr) \*\*\* See [OCR](#ocr)
![stats](docs/stats.png) ![stats](docs/stats.png)
## Getting Started ## Getting Started
1. Have an Elasticsearch (>= 6.X.X) instance running 1. Have an Elasticsearch (>= 6.X.X) instance running
@ -57,10 +55,8 @@ sist2 (Simple incremental search tool)
1. See [Usage guide](docs/USAGE.md) 1. See [Usage guide](docs/USAGE.md)
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) \* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
## Example usage ## Example usage
See [Usage guide](docs/USAGE.md) for more details See [Usage guide](docs/USAGE.md) for more details
@ -69,7 +65,6 @@ See [Usage guide](docs/USAGE.md) for more details
1. Push index to Elasticsearch: `sist2 index ./docs_idx` 1. Push index to Elasticsearch: `sist2 index ./docs_idx`
1. Start web interface: `sist2 web ./docs_idx` 1. Start web interface: `sist2 web ./docs_idx`
## Format support ## Format support
File type | Library | Content | Thumbnail | Metadata File type | Library | Content | Thumbnail | Metadata
@ -78,8 +73,8 @@ pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | *(none)* | - | yes | - | cbz,cbr | *(none)* | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags | `audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist | `video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190) | `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags | raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style | ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | *(none)* | yes | no | - | `text/plain` | *(none)* | yes | no | - |
html, xml | *(none)* | yes | no | - | html, xml | *(none)* | yes | no | - |
@ -91,38 +86,37 @@ mobi, azw, azw3 | libmobi | yes | no | author, title |
\* *See [Archive files](#archive-files)* \* *See [Archive files](#archive-files)*
### Archive files ### Archive files
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if
they were directly in the file system. Recursive (archives inside archives) **sist2** will scan files stored into archive files (zip, tar, 7z...) as if they were directly in the file system.
Recursive (archives inside archives)
scan is also supported. scan is also supported.
**Limitations**: **Limitations**:
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) * Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
is limited (see `--mem-buffer` option) is limited (see `--mem-buffer` option)
* Archive files are scanned sequentially, by a single thread. On systems where * Archive files are scanned sequentially, by a single thread. On systems where
**sist2** is not I/O bound, scans might be faster when larger archives are split **sist2** is not I/O bound, scans might be faster when larger archives are split into smaller parts.
into smaller parts.
### OCR ### OCR
You can enable OCR support for pdf,xps,fb2,epub file types with the You can enable OCR support for pdf,xps,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your `--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
package manager (`apt install tesseract-ocr-eng`) or directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files). directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed. (hin, jpn, eng, fra, rus, spa) pre-installed.
Examples Examples
```bash ```bash
sist2 scan --ocr jpn ~/Books/Manga/ sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/ sist2 scan --ocr eng ~/Books/Textbooks/
``` ```
## Build from source ## Build from source
You can compile **sist2** by yourself if you don't want to use the pre-compiled You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries (GCC 7+ required).
binaries (GCC 7+ required).
1. Install compile-time dependencies 1. Install compile-time dependencies

View File

@ -165,6 +165,30 @@
"exif_user_comment": { "exif_user_comment": {
"type": "text" "type": "text"
}, },
"exif_gps_longitude_ref": {
"type": "keyword",
"index": false
},
"exif_gps_longitude_dms": {
"type": "keyword",
"index": false
},
"exif_gps_longitude_dec": {
"type": "keyword",
"index": false
},
"exif_gps_latitude_ref": {
"type": "keyword",
"index": false
},
"exif_gps_latitude_dms": {
"type": "keyword",
"index": false
},
"exif_gps_latitude_dec": {
"type": "keyword",
"index": false
},
"author": { "author": {
"type": "text" "type": "text"
}, },

File diff suppressed because one or more lines are too long

View File

@ -18,7 +18,7 @@ typedef struct {
#define META_NEXT 0xFFFF #define META_NEXT 0xFFFF
void skip_meta(FILE *file) { void skip_meta(FILE *file) {
enum metakey key; enum metakey key = 0;
fread(&key, sizeof(uint16_t), 1, file); fread(&key, sizeof(uint16_t), 1, file);
while (key != META_NEXT) { while (key != META_NEXT) {
@ -237,7 +237,7 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
FILE *file = fopen(path, "rb"); FILE *file = fopen(path, "rb");
while (TRUE) { while (TRUE) {
buf.cur = 0; buf.cur = 0;
size_t _ = fread((void *) &line, 1, sizeof(line_t), file); size_t _ = fread((void *) &line, sizeof(line_t), 1, file);
if (feof(file)) { if (feof(file)) {
break; break;
} }
@ -284,8 +284,8 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
cJSON_AddStringToObject(document, "path", ""); cJSON_AddStringToObject(document, "path", "");
} }
enum metakey key; enum metakey key = 0;
fread(&key, sizeof(short), 1, file); fread(&key, sizeof(uint16_t), 1, file);
size_t ret; size_t ret;
while (key != META_NEXT) { while (key != META_NEXT) {
switch (key) { switch (key) {
@ -481,7 +481,7 @@ void incremental_read(GHashTable *table, const char *filepath) {
incremental_put(table, line.path_md5, line.mtime); incremental_put(table, line.path_md5, line.mtime);
while ((getc(file))) {} while ((getc(file)) != 0) {}
skip_meta(file); skip_meta(file);
} }
fclose(file); fclose(file);
@ -531,7 +531,7 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
free(buf); free(buf);
} }
enum metakey key; enum metakey key = 0;
while (1) { while (1) {
fread(&key, sizeof(uint16_t), 1, file); fread(&key, sizeof(uint16_t), 1, file);
fwrite(&key, sizeof(uint16_t), 1, dst_file); fwrite(&key, sizeof(uint16_t), 1, dst_file);

View File

@ -192,6 +192,19 @@ function makeUserTag(tag, hit) {
return userTag; return userTag;
} }
/**
 * Append an "Exif GPS" row to a metadata table, with the coordinates rendered
 * as a link to Google Maps that opens in a new tab.
 *
 * @param tbody Object whose append() receives the new <tr>
 *     (presumably a jQuery-wrapped <tbody>; confirm against the caller).
 * @param latitude Latitude used for the link text and the map URL.
 * @param longitude Longitude used for the link text and the map URL.
 */
function makeGpsMetaRow(tbody, latitude, longitude) {
    // Encode the values before placing them in the query string so that a
    // stray "&" or "#" in the metadata cannot change the URL's parameters.
    const lat = encodeURIComponent(latitude);
    const lon = encodeURIComponent(longitude);
    const mapUrl = `https://maps.google.com/?q=${lat},${lon}&ll=${lat},${lon}&t=k&z=17`;

    // rel="noopener noreferrer": the page opened via target="_blank" gets no
    // window.opener handle and no Referer header pointing back at this UI.
    const link = $("<a>")
        .text(`${latitude}, ${longitude}`)
        .attr("href", mapUrl)
        .attr("target", "_blank")
        .attr("rel", "noopener noreferrer");

    tbody.append($("<tr>")
        .append($("<td>").text("Exif GPS"))
        .append($("<td>").append(link))
    );
}
function infoButtonCb(hit) { function infoButtonCb(hit) {
return () => { return () => {
getDocumentInfo(hit["_id"]).then(doc => { getDocumentInfo(hit["_id"]).then(doc => {
@ -229,13 +242,25 @@ function infoButtonCb(hit) {
.text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " ")) .text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " "))
.attr("title", doc["mtime"])) .attr("title", doc["mtime"]))
); );
// Exif GPS
if ("exif_gps_longitude_dec" in doc) {
makeGpsMetaRow(tbody, doc["exif_gps_latitude_dec"], doc["exif_gps_longitude_dec"])
} else if ("exif_gps_longitude_dms" in doc) {
makeGpsMetaRow(
tbody,
dmsToDecimal(doc["exif_gps_latitude_dms"], doc["exif_gps_latitude_ref"]),
dmsToDecimal(doc["exif_gps_longitude_dms"], doc["exif_gps_longitude_ref"]),
)
}
const displayFields = new Set([ const displayFields = new Set([
"mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc", "mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc",
"bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author", "bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author",
"modified_by", "pages" "modified_by", "pages"
]); ]);
Object.keys(doc) Object.keys(doc)
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || key.startsWith("exif_")) .filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || (key.startsWith("exif_") && !key.includes("gps")))
.forEach(key => { .forEach(key => {
tbody.append($("<tr>") tbody.append($("<tr>")
.append($("<td>").text(key)) .append($("<td>").text(key))
@ -352,7 +377,7 @@ function createDocCard(hit) {
audio.setAttribute("src", "f/" + hit["_id"]); audio.setAttribute("src", "f/" + hit["_id"]);
audio.addEventListener("play", () => { audio.addEventListener("play", () => {
// Pause all currently playing audio tags // Pause all currently playing audio tags
$("audio").each(function(){ $("audio").each(function () {
if (this !== audio) { if (this !== audio) {
this.pause(); this.pause();
} }

View File

@ -236,3 +236,13 @@ function updateColumnStyle() {
` `
} }
} }
/**
 * Convert a degrees/minutes/seconds string to signed decimal degrees.
 *
 * @param {string} dms Comma-separated components, each written as a
 *     "numerator:denominator" rational, e.g. "45:1, 30:1, 1234:100".
 * @param {string} ref Hemisphere reference; "S" or "W" negates the result,
 *     any other value leaves it positive.
 * @returns {number} Decimal degrees (NaN when a component is not numeric).
 */
function dmsToDecimal(dms, ref) {
    // Parse one "num:den" component. A bare number ("45") is read as num/1 and
    // an absent component counts as 0, rather than yielding NaN or throwing.
    const parseRational = (token) => {
        if (token === undefined) {
            return 0
        }
        const [num, den] = token.trim().split(":")
        return den === undefined ? Number(num) : Number(num) / Number(den)
    }

    const [d, m, s] = dms.split(",")
    const degrees = parseRational(d) + (parseRational(m) / 60) + (parseRational(s) / 3600)
    return degrees * (ref === "S" || ref === "W" ? -1 : 1)
}

File diff suppressed because one or more lines are too long

View File

@ -17,17 +17,19 @@ def copy_files(files):
def sist2(*args): def sist2(*args):
print("./sist2 " + " ".join(args))
return subprocess.check_output( return subprocess.check_output(
args=["./sist2_debug", *args], args=["./sist2", *args],
) )
def sist2_index(files, *args): def sist2_index(files, *args):
path = copy_files(files) path = copy_files(files)
shutil.rmtree("i", ignore_errors=True) shutil.rmtree("test_i", ignore_errors=True)
sist2("scan", path, "-o", "i", *args) sist2("scan", path, "-o", "test_i", *args)
return iter(sist2_index_to_dict("i")) return iter(sist2_index_to_dict("test_i"))
def sist2_incremental_index(files, func=None, *args): def sist2_incremental_index(files, func=None, *args):
@ -36,14 +38,14 @@ def sist2_incremental_index(files, func=None, *args):
if func: if func:
func(path) func(path)
shutil.rmtree("i_inc", ignore_errors=True) shutil.rmtree("test_i_inc", ignore_errors=True)
sist2("scan", path, "-o", "i_inc", "--incremental", "i", *args) sist2("scan", path, "-o", "test_i_inc", "--incremental", "test_i", *args)
return iter(sist2_index_to_dict("i_inc")) return iter(sist2_index_to_dict("test_i_inc"))
def sist2_index_to_dict(index): def sist2_index_to_dict(index):
res = subprocess.check_output( res = subprocess.check_output(
args=["./sist2_debug", "index", "--print", index], args=["./sist2", "index", "--print", index],
) )
for line in res.splitlines(): for line in res.splitlines():

2
third-party/libscan vendored

@ -1 +1 @@
Subproject commit 598e748214fe0656d536e40bb9e056c058504d85 Subproject commit 9be4f02851107edac65894a1fdde16a80cad43ac