mirror of
https://github.com/simon987/sist2.git
synced 2025-04-18 01:36:42 +00:00
Compare commits
3 Commits
81670ee107
...
7c46ad632a
Author | SHA1 | Date | |
---|---|---|---|
7c46ad632a | |||
5b8c13fd13 | |||
efa4a06e56 |
40
README.md
40
README.md
@ -25,14 +25,12 @@ sist2 (Simple incremental search tool)
|
||||
* OCR support with tesseract \*\*\*
|
||||
* Stats page & disk utilisation visualization
|
||||
|
||||
|
||||
\* See [format support](#format-support)
|
||||
\*\* See [Archive files](#archive-files)
|
||||
\*\*\* See [OCR](#ocr)
|
||||
\*\*\* See [OCR](#ocr)
|
||||
|
||||

|
||||
|
||||
|
||||
## Getting Started
|
||||
|
||||
1. Have an Elasticsearch (>= 6.X.X) instance running
|
||||
@ -56,10 +54,8 @@ sist2 (Simple incremental search tool)
|
||||
1. *(or)* `docker pull simon987/sist2:latest`
|
||||
|
||||
1. See [Usage guide](docs/USAGE.md)
|
||||
|
||||
|
||||
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
|
||||
|
||||
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
|
||||
|
||||
## Example usage
|
||||
|
||||
@ -69,7 +65,6 @@ See [Usage guide](docs/USAGE.md) for more details
|
||||
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
|
||||
1. Start web interface: `sist2 web ./docs_idx`
|
||||
|
||||
|
||||
## Format support
|
||||
|
||||
File type | Library | Content | Thumbnail | Metadata
|
||||
@ -78,8 +73,8 @@ pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
|
||||
cbz,cbr | *(none)* | - | yes | - |
|
||||
`audio/*` | ffmpeg | - | yes | ID3 tags |
|
||||
`video/*` | ffmpeg | - | yes | title, comment, artist |
|
||||
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190) |
|
||||
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags |
|
||||
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
|
||||
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
|
||||
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
|
||||
`text/plain` | *(none)* | yes | no | - |
|
||||
html, xml | *(none)* | yes | no | - |
|
||||
@ -89,40 +84,39 @@ doc (MS Word 97-2003) | antiword | yes | yes | author, title |
|
||||
mobi, azw, azw3 | libmobi | yes | no | author, title |
|
||||
|
||||
\* *See [Archive files](#archive-files)*
|
||||
|
||||
|
||||
### Archive files
|
||||
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if
|
||||
they were directly in the file system. Recursive (archives inside archives)
|
||||
|
||||
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if they were directly in the file system.
|
||||
Recursive (archives inside archives)
|
||||
scan is also supported.
|
||||
|
||||
**Limitations**:
|
||||
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
|
||||
|
||||
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
|
||||
is limitted (see `--mem-buffer` option)
|
||||
* Archive files are scanned sequentially, by a single thread. On systems where
|
||||
**sist2** is not I/O bound, scans might be faster when larger archives are split
|
||||
into smaller parts.
|
||||
|
||||
|
||||
**sist2** is not I/O bound, scans might be faster when larger archives are split into smaller parts.
|
||||
|
||||
### OCR
|
||||
|
||||
You can enable OCR support for pdf,xps,fb2,epub file types with the
|
||||
`--ocr <lang>` option. Download the language data files with your
|
||||
package manager (`apt install tesseract-ocr-eng`) or directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
|
||||
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
|
||||
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
|
||||
|
||||
The `simon987/sist2` image comes with common languages
|
||||
The `simon987/sist2` image comes with common languages
|
||||
(hin, jpn, eng, fra, rus, spa) pre-installed.
|
||||
|
||||
Examples
|
||||
|
||||
```bash
|
||||
sist2 scan --ocr jpn ~/Books/Manga/
|
||||
sist2 scan --ocr eng ~/Books/Textbooks/
|
||||
```
|
||||
|
||||
|
||||
## Build from source
|
||||
|
||||
You can compile **sist2** by yourself if you don't want to use the pre-compiled
|
||||
binaries (GCC 7+ required).
|
||||
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries (GCC 7+ required).
|
||||
|
||||
1. Install compile-time dependencies
|
||||
|
||||
|
@ -105,10 +105,10 @@
|
||||
"analyzer": "my_nGram",
|
||||
"type": "text"
|
||||
},
|
||||
"_keyword.*": {
|
||||
"_keyword.*": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"_text.*": {
|
||||
"_text.*": {
|
||||
"analyzer": "content_analyzer",
|
||||
"type": "text",
|
||||
"fields": {
|
||||
@ -165,6 +165,30 @@
|
||||
"exif_user_comment": {
|
||||
"type": "text"
|
||||
},
|
||||
"exif_gps_longitude_ref": {
|
||||
"type": "keyword",
|
||||
"index": false
|
||||
},
|
||||
"exif_gps_longitude_dms": {
|
||||
"type": "keyword",
|
||||
"index": false
|
||||
},
|
||||
"exif_gps_longitude_dec": {
|
||||
"type": "keyword",
|
||||
"index": false
|
||||
},
|
||||
"exif_gps_latitude_ref": {
|
||||
"type": "keyword",
|
||||
"index": false
|
||||
},
|
||||
"exif_gps_latitude_dms": {
|
||||
"type": "keyword",
|
||||
"index": false
|
||||
},
|
||||
"exif_gps_latitude_dec": {
|
||||
"type": "keyword",
|
||||
"index": false
|
||||
},
|
||||
"author": {
|
||||
"type": "text"
|
||||
},
|
||||
|
File diff suppressed because one or more lines are too long
@ -18,7 +18,7 @@ typedef struct {
|
||||
#define META_NEXT 0xFFFF
|
||||
|
||||
void skip_meta(FILE *file) {
|
||||
enum metakey key;
|
||||
enum metakey key = 0;
|
||||
fread(&key, sizeof(uint16_t), 1, file);
|
||||
|
||||
while (key != META_NEXT) {
|
||||
@ -237,7 +237,7 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
|
||||
FILE *file = fopen(path, "rb");
|
||||
while (TRUE) {
|
||||
buf.cur = 0;
|
||||
size_t _ = fread((void *) &line, 1, sizeof(line_t), file);
|
||||
size_t _ = fread((void *) &line, sizeof(line_t), 1, file);
|
||||
if (feof(file)) {
|
||||
break;
|
||||
}
|
||||
@ -284,8 +284,8 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
|
||||
cJSON_AddStringToObject(document, "path", "");
|
||||
}
|
||||
|
||||
enum metakey key;
|
||||
fread(&key, sizeof(short), 1, file);
|
||||
enum metakey key = 0;
|
||||
fread(&key, sizeof(uint16_t), 1, file);
|
||||
size_t ret;
|
||||
while (key != META_NEXT) {
|
||||
switch (key) {
|
||||
@ -481,7 +481,7 @@ void incremental_read(GHashTable *table, const char *filepath) {
|
||||
|
||||
incremental_put(table, line.path_md5, line.mtime);
|
||||
|
||||
while ((getc(file))) {}
|
||||
while ((getc(file)) != 0) {}
|
||||
skip_meta(file);
|
||||
}
|
||||
fclose(file);
|
||||
@ -531,7 +531,7 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
|
||||
free(buf);
|
||||
}
|
||||
|
||||
enum metakey key;
|
||||
enum metakey key = 0;
|
||||
while (1) {
|
||||
fread(&key, sizeof(uint16_t), 1, file);
|
||||
fwrite(&key, sizeof(uint16_t), 1, dst_file);
|
||||
|
@ -192,6 +192,19 @@ function makeUserTag(tag, hit) {
|
||||
return userTag;
|
||||
}
|
||||
|
||||
function makeGpsMetaRow(tbody, latitude, longitude) {
|
||||
tbody.append($("<tr>")
|
||||
.append($("<td>").text("Exif GPS"))
|
||||
.append($("<td>")
|
||||
.append($("<a>")
|
||||
.text(`${latitude}, ${longitude}`)
|
||||
.attr("href", `https://maps.google.com/?q=${latitude},${longitude}&ll=${latitude},${longitude}&t=k&z=17`)
|
||||
.attr("target", "_blank")
|
||||
)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
function infoButtonCb(hit) {
|
||||
return () => {
|
||||
getDocumentInfo(hit["_id"]).then(doc => {
|
||||
@ -229,13 +242,25 @@ function infoButtonCb(hit) {
|
||||
.text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " "))
|
||||
.attr("title", doc["mtime"]))
|
||||
);
|
||||
|
||||
// Exif GPS
|
||||
if ("exif_gps_longitude_dec" in doc) {
|
||||
makeGpsMetaRow(tbody, doc["exif_gps_latitude_dec"], doc["exif_gps_longitude_dec"])
|
||||
} else if ("exif_gps_longitude_dms" in doc) {
|
||||
makeGpsMetaRow(
|
||||
tbody,
|
||||
dmsToDecimal(doc["exif_gps_latitude_dms"], doc["exif_gps_latitude_ref"]),
|
||||
dmsToDecimal(doc["exif_gps_longitude_dms"], doc["exif_gps_longitude_ref"]),
|
||||
)
|
||||
}
|
||||
|
||||
const displayFields = new Set([
|
||||
"mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc",
|
||||
"bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author",
|
||||
"modified_by", "pages"
|
||||
]);
|
||||
Object.keys(doc)
|
||||
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || key.startsWith("exif_"))
|
||||
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || (key.startsWith("exif_") && !key.includes("gps")))
|
||||
.forEach(key => {
|
||||
tbody.append($("<tr>")
|
||||
.append($("<td>").text(key))
|
||||
@ -352,7 +377,7 @@ function createDocCard(hit) {
|
||||
audio.setAttribute("src", "f/" + hit["_id"]);
|
||||
audio.addEventListener("play", () => {
|
||||
// Pause all currently playing audio tags
|
||||
$("audio").each(function(){
|
||||
$("audio").each(function () {
|
||||
if (this !== audio) {
|
||||
this.pause();
|
||||
}
|
||||
|
@ -235,4 +235,14 @@ function updateColumnStyle() {
|
||||
}
|
||||
`
|
||||
}
|
||||
}
|
||||
|
||||
function dmsToDecimal(dms, ref) {
|
||||
const tokens = dms.split(",")
|
||||
|
||||
const d = Number(tokens[0].trim().split(":")[0]) / Number(tokens[0].trim().split(":")[1])
|
||||
const m = Number(tokens[1].trim().split(":")[0]) / Number(tokens[1].trim().split(":")[1])
|
||||
const s = Number(tokens[2].trim().split(":")[0]) / Number(tokens[2].trim().split(":")[1])
|
||||
|
||||
return (d + (m / 60) + (s / 3600)) * (ref === "S" || ref === "W" ? -1 : 1)
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -17,17 +17,19 @@ def copy_files(files):
|
||||
|
||||
|
||||
def sist2(*args):
|
||||
print("./sist2 " + " ".join(args))
|
||||
|
||||
return subprocess.check_output(
|
||||
args=["./sist2_debug", *args],
|
||||
args=["./sist2", *args],
|
||||
)
|
||||
|
||||
|
||||
def sist2_index(files, *args):
|
||||
path = copy_files(files)
|
||||
|
||||
shutil.rmtree("i", ignore_errors=True)
|
||||
sist2("scan", path, "-o", "i", *args)
|
||||
return iter(sist2_index_to_dict("i"))
|
||||
shutil.rmtree("test_i", ignore_errors=True)
|
||||
sist2("scan", path, "-o", "test_i", *args)
|
||||
return iter(sist2_index_to_dict("test_i"))
|
||||
|
||||
|
||||
def sist2_incremental_index(files, func=None, *args):
|
||||
@ -36,14 +38,14 @@ def sist2_incremental_index(files, func=None, *args):
|
||||
if func:
|
||||
func(path)
|
||||
|
||||
shutil.rmtree("i_inc", ignore_errors=True)
|
||||
sist2("scan", path, "-o", "i_inc", "--incremental", "i", *args)
|
||||
return iter(sist2_index_to_dict("i_inc"))
|
||||
shutil.rmtree("test_i_inc", ignore_errors=True)
|
||||
sist2("scan", path, "-o", "test_i_inc", "--incremental", "test_i", *args)
|
||||
return iter(sist2_index_to_dict("test_i_inc"))
|
||||
|
||||
|
||||
def sist2_index_to_dict(index):
|
||||
res = subprocess.check_output(
|
||||
args=["./sist2_debug", "index", "--print", index],
|
||||
args=["./sist2", "index", "--print", index],
|
||||
)
|
||||
|
||||
for line in res.splitlines():
|
||||
|
2
third-party/libscan
vendored
2
third-party/libscan
vendored
@ -1 +1 @@
|
||||
Subproject commit 598e748214fe0656d536e40bb9e056c058504d85
|
||||
Subproject commit 9be4f02851107edac65894a1fdde16a80cad43ac
|
Loading…
x
Reference in New Issue
Block a user