Update readme

Handle GPS metadata in the UI
Fix meta_key UB problem
2025-12-14 07:49:06 +00:00 · 2021-06-11 20:44:47 -04:00 · 2021-06-11 20:41:05 -04:00 · 2021-06-11 20:19:36 -04:00
9 changed files with 99 additions and 44 deletions
--- a/README.md
+++ b/README.md
@@ -25,14 +25,12 @@ sist2 (Simple incremental search tool)
 * OCR support with tesseract \*\*\*
 * Stats page & disk utilisation visualization

-
 \* See [format support](#format-support)    
 \*\* See [Archive files](#archive-files)    
-\*\*\* See [OCR](#ocr)    
+\*\*\* See [OCR](#ocr)

 ![stats](docs/stats.png)

-
 ## Getting Started

 1. Have an Elasticsearch (>= 6.X.X) instance running
@@ -56,10 +54,8 @@ sist2 (Simple incremental search tool)
    1. *(or)* `docker pull simon987/sist2:latest`

 1. See [Usage guide](docs/USAGE.md)
-   
-
-\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)    

+\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)

 ## Example usage

@@ -69,7 +65,6 @@ See [Usage guide](docs/USAGE.md) for more details
 1. Push index to Elasticsearch: `sist2 index ./docs_idx`
 1. Start web interface: `sist2 web ./docs_idx`

-
 ## Format support

 File type | Library | Content | Thumbnail | Metadata
@@ -78,8 +73,8 @@ pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
 cbz,cbr | *(none)* | - | yes | - |
 `audio/*` | ffmpeg | - | yes | ID3 tags |
 `video/*` | ffmpeg | - | yes | title, comment, artist |
-`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190) |
-raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf  | LibRaw | - | yes | Common EXIF tags |
+`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
+raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf  | LibRaw | - | yes | Common EXIF tags, GPS tags |
 ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
 `text/plain` | *(none)* | yes | no | - |
 html, xml | *(none)* | yes | no | - |
@@ -89,40 +84,39 @@ doc (MS Word 97-2003) | antiword | yes | yes | author, title |
 mobi, azw, azw3 | libmobi | yes | no | author, title |

 \* *See [Archive files](#archive-files)*
- 
+
 ### Archive files
-**sist2** will scan files stored into archive files (zip, tar, 7z...) as if
-they were directly in the file system. Recursive (archives inside archives)
+
+**sist2** will scan files stored into archive files (zip, tar, 7z...) as if they were directly in the file system.
+Recursive (archives inside archives)
 scan is also supported.

 **Limitations**:
-* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) 
+
+* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
  is limited (see `--mem-buffer` option)
 * Archive files are scanned sequentially, by a single thread. On systems where
-**sist2** is not I/O bound, scans might be faster when larger archives are split
- into smaller parts.
- 
- 
+  **sist2** is not I/O bound, scans might be faster when larger archives are split into smaller parts.
+
 ### OCR

 You can enable OCR support for pdf,xps,fb2,epub file types with the
-`--ocr <lang>` option. Download the language data files with your
-package manager (`apt install tesseract-ocr-eng`) or directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
+`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
+directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).

-The `simon987/sist2` image comes with common languages 
+The `simon987/sist2` image comes with common languages
 (hin, jpn, eng, fra, rus, spa) pre-installed.

 Examples
+
 ```bash
 sist2 scan --ocr jpn ~/Books/Manga/
 sist2 scan --ocr eng ~/Books/Textbooks/
 ```

-
 ## Build from source

-You can compile **sist2** by yourself if you don't want to use the pre-compiled
-binaries (GCC 7+ required).
+You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries (GCC 7+ required).

 1. Install compile-time dependencies

--- a/schema/mappings.json
+++ b/schema/mappings.json
@@ -105,10 +105,10 @@
      "analyzer": "my_nGram",
      "type": "text"
    },
-    "_keyword.*":  {
+    "_keyword.*": {
      "type": "keyword"
    },
-    "_text.*":  {
+    "_text.*": {
      "analyzer": "content_analyzer",
      "type": "text",
      "fields": {
@@ -165,6 +165,30 @@
    "exif_user_comment": {
      "type": "text"
    },
+    "exif_gps_longitude_ref": {
+      "type": "keyword",
+      "index": false
+    },
+    "exif_gps_longitude_dms": {
+      "type": "keyword",
+      "index": false
+    },
+    "exif_gps_longitude_dec": {
+      "type": "keyword",
+      "index": false
+    },
+    "exif_gps_latitude_ref": {
+      "type": "keyword",
+      "index": false
+    },
+    "exif_gps_latitude_dms": {
+      "type": "keyword",
+      "index": false
+    },
+    "exif_gps_latitude_dec": {
+      "type": "keyword",
+      "index": false
+    },
    "author": {
      "type": "text"
    },
--- a/src/index/static_generated.c
+++ b/src/index/static_generated.c
--- a/src/io/serialize.c
+++ b/src/io/serialize.c
@@ -18,7 +18,7 @@ typedef struct {
 #define META_NEXT 0xFFFF

 void skip_meta(FILE *file) {
-    enum metakey key;
+    enum metakey key = 0;
    fread(&key, sizeof(uint16_t), 1, file);

    while (key != META_NEXT) {
@@ -237,7 +237,7 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
    FILE *file = fopen(path, "rb");
    while (TRUE) {
        buf.cur = 0;
-        size_t _ = fread((void *) &line, 1, sizeof(line_t), file);
+        size_t _ = fread((void *) &line, sizeof(line_t), 1, file);
        if (feof(file)) {
            break;
        }
@@ -284,8 +284,8 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
            cJSON_AddStringToObject(document, "path", "");
        }

-        enum metakey key;
-        fread(&key, sizeof(short), 1, file);
+        enum metakey key = 0;
+        fread(&key, sizeof(uint16_t), 1, file);
        size_t ret;
        while (key != META_NEXT) {
            switch (key) {
@@ -481,7 +481,7 @@ void incremental_read(GHashTable *table, const char *filepath) {

        incremental_put(table, line.path_md5, line.mtime);

-        while ((getc(file))) {}
+        while ((getc(file)) != 0) {}
        skip_meta(file);
    }
    fclose(file);
@@ -531,7 +531,7 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
                free(buf);
            }

-            enum metakey key;
+            enum metakey key = 0;
            while (1) {
                fread(&key, sizeof(uint16_t), 1, file);
                fwrite(&key, sizeof(uint16_t), 1, dst_file);
--- a/src/static/js/dom.js
+++ b/src/static/js/dom.js
@@ -192,6 +192,19 @@ function makeUserTag(tag, hit) {
    return userTag;
 }

+function makeGpsMetaRow(tbody, latitude, longitude) {
+    // Appends an "Exif GPS" row to tbody; the value cell links to the coordinates on Google Maps.
+    const lat = encodeURIComponent(latitude), lon = encodeURIComponent(longitude); // metadata comes from scanned files
+    const link = $("<a>")
+        .text(`${latitude}, ${longitude}`)
+        .attr("href", `https://maps.google.com/?q=${lat},${lon}&ll=${lat},${lon}&t=k&z=17`)
+        // rel=noopener keeps the newly opened tab from reaching this page through window.opener
+        .attr({target: "_blank", rel: "noopener noreferrer"});
+    tbody.append($("<tr>")
+        .append($("<td>").text("Exif GPS"))
+        .append($("<td>").append(link)));
+}
+
 function infoButtonCb(hit) {
    return () => {
        getDocumentInfo(hit["_id"]).then(doc => {
@@ -229,13 +242,25 @@ function infoButtonCb(hit) {
                    .text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " "))
                    .attr("title", doc["mtime"]))
            );
+
+            // Exif GPS
+            if ("exif_gps_longitude_dec" in doc) {
+                makeGpsMetaRow(tbody, doc["exif_gps_latitude_dec"], doc["exif_gps_longitude_dec"])
+            } else if ("exif_gps_longitude_dms" in doc) {
+                makeGpsMetaRow(
+                    tbody,
+                    dmsToDecimal(doc["exif_gps_latitude_dms"], doc["exif_gps_latitude_ref"]),
+                    dmsToDecimal(doc["exif_gps_longitude_dms"], doc["exif_gps_longitude_ref"]),
+                )
+            }
+
            const displayFields = new Set([
                "mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc",
                "bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author",
                "modified_by", "pages"
            ]);
            Object.keys(doc)
-                .filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || key.startsWith("exif_"))
+                .filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || (key.startsWith("exif_") && !key.includes("gps")))
                .forEach(key => {
                    tbody.append($("<tr>")
                        .append($("<td>").text(key))
@@ -352,7 +377,7 @@ function createDocCard(hit) {
            audio.setAttribute("src", "f/" + hit["_id"]);
            audio.addEventListener("play", () => {
                // Pause all currently playing audio tags
-                $("audio").each(function(){
+                $("audio").each(function () {
                    if (this !== audio) {
                        this.pause();
                    }
--- a/src/static/js/util.js
+++ b/src/static/js/util.js
@@ -235,4 +235,14 @@ function updateColumnStyle() {
 }
        `
    }
+}
+
+function dmsToDecimal(dms, ref) {
+    // Signed decimal degrees from an EXIF "n:d, n:d, n:d" (deg, min, sec) string; ref "S" or "W" negates.
+    const ratio = (part) => {
+        const [num, den] = part.trim().split(":") // "numerator:denominator"
+        return Number(num) / Number(den)
+    }
+    const [deg, min, sec] = dms.split(",")
+    return (ratio(deg) + (ratio(min) / 60) + (ratio(sec) / 3600)) * (ref === "S" || ref === "W" ? -1 : 1)
 }
--- a/src/web/static_generated.c
+++ b/src/web/static_generated.c
--- a/tests/test_scan.py
+++ b/tests/test_scan.py
@@ -17,17 +17,19 @@ def copy_files(files):


 def sist2(*args):
+    print("./sist2 " + " ".join(args))
+
    return subprocess.check_output(
-        args=["./sist2_debug", *args],
+        args=["./sist2", *args],
    )


 def sist2_index(files, *args):
    path = copy_files(files)

-    shutil.rmtree("i", ignore_errors=True)
-    sist2("scan", path, "-o", "i", *args)
-    return iter(sist2_index_to_dict("i"))
+    shutil.rmtree("test_i", ignore_errors=True)
+    sist2("scan", path, "-o", "test_i", *args)
+    return iter(sist2_index_to_dict("test_i"))


 def sist2_incremental_index(files, func=None, *args):
@@ -36,14 +38,14 @@ def sist2_incremental_index(files, func=None, *args):
    if func:
        func(path)

-    shutil.rmtree("i_inc", ignore_errors=True)
-    sist2("scan", path, "-o", "i_inc", "--incremental", "i", *args)
-    return iter(sist2_index_to_dict("i_inc"))
+    shutil.rmtree("test_i_inc", ignore_errors=True)
+    sist2("scan", path, "-o", "test_i_inc", "--incremental", "test_i", *args)
+    return iter(sist2_index_to_dict("test_i_inc"))


 def sist2_index_to_dict(index):
    res = subprocess.check_output(
-        args=["./sist2_debug", "index", "--print", index],
+        args=["./sist2", "index", "--print", index],
    )

    for line in res.splitlines():
--- a/third-party/libscan
+++ b/third-party/libscan
Author	SHA1	Message	Date
simon987	7c46ad632a	Update readme	2021-06-11 20:44:47 -04:00
simon987	5b8c13fd13	Handle GPS metadata in the UI	2021-06-11 20:41:05 -04:00
simon987	efa4a06e56	Fix meta_key UB problem	2021-06-11 20:19:36 -04:00