mirror of
https://github.com/simon987/sist2.git
synced 2025-04-21 19:26:45 +00:00
Compare commits
No commits in common. "7c46ad632af827e15986a531076cd657d0cf4243" and "81670ee10752bf44549958ce0cac43ca4dca737d" have entirely different histories.
7c46ad632a
...
81670ee107
28
README.md
28
README.md
@ -25,12 +25,14 @@ sist2 (Simple incremental search tool)
|
|||||||
* OCR support with tesseract \*\*\*
|
* OCR support with tesseract \*\*\*
|
||||||
* Stats page & disk utilisation visualization
|
* Stats page & disk utilisation visualization
|
||||||
|
|
||||||
|
|
||||||
\* See [format support](#format-support)
|
\* See [format support](#format-support)
|
||||||
\*\* See [Archive files](#archive-files)
|
\*\* See [Archive files](#archive-files)
|
||||||
\*\*\* See [OCR](#ocr)
|
\*\*\* See [OCR](#ocr)
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
1. Have an Elasticsearch (>= 6.X.X) instance running
|
1. Have an Elasticsearch (>= 6.X.X) instance running
|
||||||
@ -55,8 +57,10 @@ sist2 (Simple incremental search tool)
|
|||||||
|
|
||||||
1. See [Usage guide](docs/USAGE.md)
|
1. See [Usage guide](docs/USAGE.md)
|
||||||
|
|
||||||
|
|
||||||
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
|
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
|
||||||
|
|
||||||
|
|
||||||
## Example usage
|
## Example usage
|
||||||
|
|
||||||
See [Usage guide](docs/USAGE.md) for more details
|
See [Usage guide](docs/USAGE.md) for more details
|
||||||
@ -65,6 +69,7 @@ See [Usage guide](docs/USAGE.md) for more details
|
|||||||
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
|
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
|
||||||
1. Start web interface: `sist2 web ./docs_idx`
|
1. Start web interface: `sist2 web ./docs_idx`
|
||||||
|
|
||||||
|
|
||||||
## Format support
|
## Format support
|
||||||
|
|
||||||
File type | Library | Content | Thumbnail | Metadata
|
File type | Library | Content | Thumbnail | Metadata
|
||||||
@ -73,8 +78,8 @@ pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
|
|||||||
cbz,cbr | *(none)* | - | yes | - |
|
cbz,cbr | *(none)* | - | yes | - |
|
||||||
`audio/*` | ffmpeg | - | yes | ID3 tags |
|
`audio/*` | ffmpeg | - | yes | ID3 tags |
|
||||||
`video/*` | ffmpeg | - | yes | title, comment, artist |
|
`video/*` | ffmpeg | - | yes | title, comment, artist |
|
||||||
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
|
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190) |
|
||||||
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
|
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags |
|
||||||
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
|
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
|
||||||
`text/plain` | *(none)* | yes | no | - |
|
`text/plain` | *(none)* | yes | no | - |
|
||||||
html, xml | *(none)* | yes | no | - |
|
html, xml | *(none)* | yes | no | - |
|
||||||
@ -86,37 +91,38 @@ mobi, azw, azw3 | libmobi | yes | no | author, title |
|
|||||||
\* *See [Archive files](#archive-files)*
|
\* *See [Archive files](#archive-files)*
|
||||||
|
|
||||||
### Archive files
|
### Archive files
|
||||||
|
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if
|
||||||
**sist2** will scan files stored into archive files (zip, tar, 7z...) as if they were directly in the file system.
|
they were directly in the file system. Recursive (archives inside archives)
|
||||||
Recursive (archives inside archives)
|
|
||||||
scan is also supported.
|
scan is also supported.
|
||||||
|
|
||||||
**Limitations**:
|
**Limitations**:
|
||||||
|
|
||||||
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
|
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
|
||||||
is limited (see `--mem-buffer` option)
|
is limited (see `--mem-buffer` option)
|
||||||
* Archive files are scanned sequentially, by a single thread. On systems where
|
* Archive files are scanned sequentially, by a single thread. On systems where
|
||||||
**sist2** is not I/O bound, scans might be faster when larger archives are split into smaller parts.
|
**sist2** is not I/O bound, scans might be faster when larger archives are split
|
||||||
|
into smaller parts.
|
||||||
|
|
||||||
|
|
||||||
### OCR
|
### OCR
|
||||||
|
|
||||||
You can enable OCR support for pdf,xps,fb2,epub file types with the
|
You can enable OCR support for pdf,xps,fb2,epub file types with the
|
||||||
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
|
`--ocr <lang>` option. Download the language data files with your
|
||||||
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
|
package manager (`apt install tesseract-ocr-eng`) or directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
|
||||||
|
|
||||||
The `simon987/sist2` image comes with common languages
|
The `simon987/sist2` image comes with common languages
|
||||||
(hin, jpn, eng, fra, rus, spa) pre-installed.
|
(hin, jpn, eng, fra, rus, spa) pre-installed.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sist2 scan --ocr jpn ~/Books/Manga/
|
sist2 scan --ocr jpn ~/Books/Manga/
|
||||||
sist2 scan --ocr eng ~/Books/Textbooks/
|
sist2 scan --ocr eng ~/Books/Textbooks/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
## Build from source
|
## Build from source
|
||||||
|
|
||||||
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries (GCC 7+ required).
|
You can compile **sist2** by yourself if you don't want to use the pre-compiled
|
||||||
|
binaries (GCC 7+ required).
|
||||||
|
|
||||||
1. Install compile-time dependencies
|
1. Install compile-time dependencies
|
||||||
|
|
||||||
|
@ -165,30 +165,6 @@
|
|||||||
"exif_user_comment": {
|
"exif_user_comment": {
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
"exif_gps_longitude_ref": {
|
|
||||||
"type": "keyword",
|
|
||||||
"index": false
|
|
||||||
},
|
|
||||||
"exif_gps_longitude_dms": {
|
|
||||||
"type": "keyword",
|
|
||||||
"index": false
|
|
||||||
},
|
|
||||||
"exif_gps_longitude_dec": {
|
|
||||||
"type": "keyword",
|
|
||||||
"index": false
|
|
||||||
},
|
|
||||||
"exif_gps_latitude_ref": {
|
|
||||||
"type": "keyword",
|
|
||||||
"index": false
|
|
||||||
},
|
|
||||||
"exif_gps_latitude_dms": {
|
|
||||||
"type": "keyword",
|
|
||||||
"index": false
|
|
||||||
},
|
|
||||||
"exif_gps_latitude_dec": {
|
|
||||||
"type": "keyword",
|
|
||||||
"index": false
|
|
||||||
},
|
|
||||||
"author": {
|
"author": {
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
|
File diff suppressed because one or more lines are too long
@ -18,7 +18,7 @@ typedef struct {
|
|||||||
#define META_NEXT 0xFFFF
|
#define META_NEXT 0xFFFF
|
||||||
|
|
||||||
void skip_meta(FILE *file) {
|
void skip_meta(FILE *file) {
|
||||||
enum metakey key = 0;
|
enum metakey key;
|
||||||
fread(&key, sizeof(uint16_t), 1, file);
|
fread(&key, sizeof(uint16_t), 1, file);
|
||||||
|
|
||||||
while (key != META_NEXT) {
|
while (key != META_NEXT) {
|
||||||
@ -237,7 +237,7 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
|
|||||||
FILE *file = fopen(path, "rb");
|
FILE *file = fopen(path, "rb");
|
||||||
while (TRUE) {
|
while (TRUE) {
|
||||||
buf.cur = 0;
|
buf.cur = 0;
|
||||||
size_t _ = fread((void *) &line, sizeof(line_t), 1, file);
|
size_t _ = fread((void *) &line, 1, sizeof(line_t), file);
|
||||||
if (feof(file)) {
|
if (feof(file)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -284,8 +284,8 @@ void read_index_bin(const char *path, const char *index_id, index_func func) {
|
|||||||
cJSON_AddStringToObject(document, "path", "");
|
cJSON_AddStringToObject(document, "path", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
enum metakey key = 0;
|
enum metakey key;
|
||||||
fread(&key, sizeof(uint16_t), 1, file);
|
fread(&key, sizeof(short), 1, file);
|
||||||
size_t ret;
|
size_t ret;
|
||||||
while (key != META_NEXT) {
|
while (key != META_NEXT) {
|
||||||
switch (key) {
|
switch (key) {
|
||||||
@ -481,7 +481,7 @@ void incremental_read(GHashTable *table, const char *filepath) {
|
|||||||
|
|
||||||
incremental_put(table, line.path_md5, line.mtime);
|
incremental_put(table, line.path_md5, line.mtime);
|
||||||
|
|
||||||
while ((getc(file)) != 0) {}
|
while ((getc(file))) {}
|
||||||
skip_meta(file);
|
skip_meta(file);
|
||||||
}
|
}
|
||||||
fclose(file);
|
fclose(file);
|
||||||
@ -531,7 +531,7 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
|
|||||||
free(buf);
|
free(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
enum metakey key = 0;
|
enum metakey key;
|
||||||
while (1) {
|
while (1) {
|
||||||
fread(&key, sizeof(uint16_t), 1, file);
|
fread(&key, sizeof(uint16_t), 1, file);
|
||||||
fwrite(&key, sizeof(uint16_t), 1, dst_file);
|
fwrite(&key, sizeof(uint16_t), 1, dst_file);
|
||||||
|
@ -192,19 +192,6 @@ function makeUserTag(tag, hit) {
|
|||||||
return userTag;
|
return userTag;
|
||||||
}
|
}
|
||||||
|
|
||||||
function makeGpsMetaRow(tbody, latitude, longitude) {
|
|
||||||
tbody.append($("<tr>")
|
|
||||||
.append($("<td>").text("Exif GPS"))
|
|
||||||
.append($("<td>")
|
|
||||||
.append($("<a>")
|
|
||||||
.text(`${latitude}, ${longitude}`)
|
|
||||||
.attr("href", `https://maps.google.com/?q=${latitude},${longitude}&ll=${latitude},${longitude}&t=k&z=17`)
|
|
||||||
.attr("target", "_blank")
|
|
||||||
)
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function infoButtonCb(hit) {
|
function infoButtonCb(hit) {
|
||||||
return () => {
|
return () => {
|
||||||
getDocumentInfo(hit["_id"]).then(doc => {
|
getDocumentInfo(hit["_id"]).then(doc => {
|
||||||
@ -242,25 +229,13 @@ function infoButtonCb(hit) {
|
|||||||
.text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " "))
|
.text(new Date(doc["mtime"] * 1000).toISOString().split(".")[0].replace("T", " "))
|
||||||
.attr("title", doc["mtime"]))
|
.attr("title", doc["mtime"]))
|
||||||
);
|
);
|
||||||
|
|
||||||
// Exif GPS
|
|
||||||
if ("exif_gps_longitude_dec" in doc) {
|
|
||||||
makeGpsMetaRow(tbody, doc["exif_gps_latitude_dec"], doc["exif_gps_longitude_dec"])
|
|
||||||
} else if ("exif_gps_longitude_dms" in doc) {
|
|
||||||
makeGpsMetaRow(
|
|
||||||
tbody,
|
|
||||||
dmsToDecimal(doc["exif_gps_latitude_dms"], doc["exif_gps_latitude_ref"]),
|
|
||||||
dmsToDecimal(doc["exif_gps_longitude_dms"], doc["exif_gps_longitude_ref"]),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
const displayFields = new Set([
|
const displayFields = new Set([
|
||||||
"mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc",
|
"mime", "size", "path", "title", "width", "height", "duration", "audioc", "videoc",
|
||||||
"bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author",
|
"bitrate", "artist", "album", "album_artist", "genre", "title", "font_name", "tag", "author",
|
||||||
"modified_by", "pages"
|
"modified_by", "pages"
|
||||||
]);
|
]);
|
||||||
Object.keys(doc)
|
Object.keys(doc)
|
||||||
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || (key.startsWith("exif_") && !key.includes("gps")))
|
.filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || key.startsWith("exif_"))
|
||||||
.forEach(key => {
|
.forEach(key => {
|
||||||
tbody.append($("<tr>")
|
tbody.append($("<tr>")
|
||||||
.append($("<td>").text(key))
|
.append($("<td>").text(key))
|
||||||
|
@ -236,13 +236,3 @@ function updateColumnStyle() {
|
|||||||
`
|
`
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function dmsToDecimal(dms, ref) {
|
|
||||||
const tokens = dms.split(",")
|
|
||||||
|
|
||||||
const d = Number(tokens[0].trim().split(":")[0]) / Number(tokens[0].trim().split(":")[1])
|
|
||||||
const m = Number(tokens[1].trim().split(":")[0]) / Number(tokens[1].trim().split(":")[1])
|
|
||||||
const s = Number(tokens[2].trim().split(":")[0]) / Number(tokens[2].trim().split(":")[1])
|
|
||||||
|
|
||||||
return (d + (m / 60) + (s / 3600)) * (ref === "S" || ref === "W" ? -1 : 1)
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
@ -17,19 +17,17 @@ def copy_files(files):
|
|||||||
|
|
||||||
|
|
||||||
def sist2(*args):
|
def sist2(*args):
|
||||||
print("./sist2 " + " ".join(args))
|
|
||||||
|
|
||||||
return subprocess.check_output(
|
return subprocess.check_output(
|
||||||
args=["./sist2", *args],
|
args=["./sist2_debug", *args],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def sist2_index(files, *args):
|
def sist2_index(files, *args):
|
||||||
path = copy_files(files)
|
path = copy_files(files)
|
||||||
|
|
||||||
shutil.rmtree("test_i", ignore_errors=True)
|
shutil.rmtree("i", ignore_errors=True)
|
||||||
sist2("scan", path, "-o", "test_i", *args)
|
sist2("scan", path, "-o", "i", *args)
|
||||||
return iter(sist2_index_to_dict("test_i"))
|
return iter(sist2_index_to_dict("i"))
|
||||||
|
|
||||||
|
|
||||||
def sist2_incremental_index(files, func=None, *args):
|
def sist2_incremental_index(files, func=None, *args):
|
||||||
@ -38,14 +36,14 @@ def sist2_incremental_index(files, func=None, *args):
|
|||||||
if func:
|
if func:
|
||||||
func(path)
|
func(path)
|
||||||
|
|
||||||
shutil.rmtree("test_i_inc", ignore_errors=True)
|
shutil.rmtree("i_inc", ignore_errors=True)
|
||||||
sist2("scan", path, "-o", "test_i_inc", "--incremental", "test_i", *args)
|
sist2("scan", path, "-o", "i_inc", "--incremental", "i", *args)
|
||||||
return iter(sist2_index_to_dict("test_i_inc"))
|
return iter(sist2_index_to_dict("i_inc"))
|
||||||
|
|
||||||
|
|
||||||
def sist2_index_to_dict(index):
|
def sist2_index_to_dict(index):
|
||||||
res = subprocess.check_output(
|
res = subprocess.check_output(
|
||||||
args=["./sist2", "index", "--print", index],
|
args=["./sist2_debug", "index", "--print", index],
|
||||||
)
|
)
|
||||||
|
|
||||||
for line in res.splitlines():
|
for line in res.splitlines():
|
||||||
|
2
third-party/libscan
vendored
2
third-party/libscan
vendored
@ -1 +1 @@
|
|||||||
Subproject commit 9be4f02851107edac65894a1fdde16a80cad43ac
|
Subproject commit 598e748214fe0656d536e40bb9e056c058504d85
|
Loading…
x
Reference in New Issue
Block a user