It compiles! (I think)

This commit is contained in:
simon987 2020-04-04 11:13:52 -04:00
parent 035fa26dc4
commit 647fd70d8c
10 changed files with 90 additions and 160 deletions

View File

@ -113,18 +113,16 @@ sist2 scan --ocr eng ~/Books/Textbooks/
## Build from source ## Build from source
You can compile **sist2** by yourself if you don't want to use the pre-compiled You can compile **sist2** by yourself if you don't want to use the pre-compiled
binaries. binaries (GCC 7+ required).
1. Install compile-time dependencies 1. Install compile-time dependencies
```bash ```bash
vcpkg install lmdb cjson glib vcpkg install lmdb cjson glib libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 ffmpeg zstd
``` ```
2. Build 2. Build
```bash ```bash
git clone --recurse-submodules https://github.com/simon987/sist2 cmake -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .
./scripts/get_static_libs.sh
cmake .
make make
``` ```

View File

@ -1,12 +1,11 @@
#!/usr/bin/env bash #!/usr/bin/env bash
./scripts/get_static_libs.sh
rm -rf CMakeFiles CmakeCache.txt rm -rf CMakeFiles CmakeCache.txt
cmake -DSIST_DEBUG=off . cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
make make
strip sist2 strip sist2
rm -rf CMakeFiles CmakeCache.txt rm -rf CMakeFiles CmakeCache.txt
cmake -DSIST_DEBUG=on . cmake -DSIST_DEBUG=on -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
make make

View File

@ -3,7 +3,7 @@
#include "sist.h" #include "sist.h"
#include "libscan/scan.h" #include "libscan/arc/arc.h"
typedef struct scan_args { typedef struct scan_args {
float quality; float quality;

View File

@ -4,6 +4,13 @@
#include "sist.h" #include "sist.h"
#include "tpool.h" #include "tpool.h"
#include "libscan/scan.h" #include "libscan/scan.h"
#include "libscan/arc/arc.h"
#include "libscan/cbr/cbr.h"
#include "libscan/ebook/ebook.h"
#include "libscan/font/font.h"
#include "libscan/media/media.h"
#include "libscan/ooxml/ooxml.h"
#include "libscan/text/text.h"
#include <glib.h> #include <glib.h>
#include <pcre.h> #include <pcre.h>
@ -17,12 +24,8 @@ struct {
tpool_t *pool; tpool_t *pool;
int tn_size;
int threads; int threads;
int content_size;
float tn_qscale;
int depth; int depth;
archive_mode_t archive_mode;
int verbose; int verbose;
int very_verbose; int very_verbose;
@ -32,28 +35,30 @@ struct {
GHashTable *original_table; GHashTable *original_table;
GHashTable *copy_table; GHashTable *copy_table;
pthread_mutex_t mupdf_mu;
char * tesseract_lang;
const char * tesseract_path;
pcre *exclude; pcre *exclude;
pcre_extra *exclude_extra; pcre_extra *exclude_extra;
int fast; int fast;
scan_arc_ctx_t arc_ctx;
scan_cbr_ctx_t cbr_ctx;
scan_ebook_ctx_t ebook_ctx;
scan_font_ctx_t font_ctx;
scan_media_ctx_t media_ctx;
scan_ooxml_ctx_t ooxml_ctx;
scan_text_ctx_t text_ctx;
} ScanCtx; } ScanCtx;
//TODO Move to log.h
struct { struct {
int verbose; int verbose;
int very_verbose; int very_verbose;
int no_color; int no_color;
} LogCtx; } LogCtx;
//TODO Move to index.h ?
struct { struct {
char *es_url; char *es_url;
int batch_size; int batch_size;
} IndexCtx; } IndexCtx;
//TODO Move to serve.h ?
struct { struct {
char *es_url; char *es_url;
int index_count; int index_count;

View File

@ -11,8 +11,6 @@ typedef struct store_t {
pthread_rwlock_t lock; pthread_rwlock_t lock;
} store_t; } store_t;
#include "../sist.h"
store_t *store_create(char *path); store_t *store_create(char *path);
void store_destroy(store_t *store); void store_destroy(store_t *store);

View File

@ -3,8 +3,6 @@
#define _XOPEN_SOURCE 500 #define _XOPEN_SOURCE 500
#include "../sist.h"
int walk_directory_tree(const char *); int walk_directory_tree(const char *);
#endif #endif

View File

@ -2,7 +2,6 @@
#include "ctx.h" #include "ctx.h"
#include <third-party/argparse/argparse.h> #include <third-party/argparse/argparse.h>
#include <uuid/uuid.h>
#include <glib.h> #include <glib.h>
#include "cli.h" #include "cli.h"
@ -10,10 +9,10 @@
#include "io/store.h" #include "io/store.h"
#include "tpool.h" #include "tpool.h"
#include "io/walk.h" #include "io/walk.h"
#include "io/walk.h"
#include "index/elastic.h" #include "index/elastic.h"
#include "web/serve.h" #include "web/serve.h"
#include "parsing/mime.h" #include "parsing/mime.h"
#include "parsing/parse.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool." #define DESCRIPTION "Lightning-fast file system indexer and search tool."
@ -28,11 +27,6 @@ static const char *const usage[] = {
NULL, NULL,
}; };
void global_init() {
//TODO
// curl_global_init(CURL_GLOBAL_NOTHING);
}
void init_dir(const char *dirpath) { void init_dir(const char *dirpath) {
char path[PATH_MAX]; char path[PATH_MAX];
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath); snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
@ -51,28 +45,74 @@ void scan_print_header() {
LOG_INFOF("main.c", "sist2 v%s", Version) LOG_INFOF("main.c", "sist2 v%s", Version)
} }
void sist2_scan(scan_args_t *args) { void _store(char *key, size_t key_len, char *buf, size_t buf_len) {
store_write(ScanCtx.index.store, key, key_len, buf, buf_len);
}
void initialize_scan_context(scan_args_t *args) {
// Arc
ScanCtx.arc_ctx.mode = args->archive_mode;
ScanCtx.arc_ctx.log = sist_log;
ScanCtx.arc_ctx.logf = sist_logf;
ScanCtx.arc_ctx.parse = (parse_callback_t) parse;
// Cbr
ScanCtx.cbr_ctx.log = sist_log;
ScanCtx.cbr_ctx.logf = sist_logf;
ScanCtx.cbr_ctx.store = _store;
ScanCtx.cbr_ctx.cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr");
// Ebook
pthread_mutex_init(&ScanCtx.ebook_ctx.mupdf_mutex, NULL);
ScanCtx.ebook_ctx.content_size = args->content_size;
ScanCtx.ebook_ctx.tn_size = args->size;
ScanCtx.ebook_ctx.tesseract_lang = args->tesseract_lang;
ScanCtx.ebook_ctx.tesseract_path = args->tesseract_path;
ScanCtx.ebook_ctx.log = sist_log;
ScanCtx.ebook_ctx.logf = sist_logf;
ScanCtx.ebook_ctx.store = _store;
// Font
ScanCtx.font_ctx.enable_tn = args->size > 0;
ScanCtx.font_ctx.log = sist_log;
ScanCtx.font_ctx.logf = sist_logf;
ScanCtx.font_ctx.store = _store;
// Media
ScanCtx.media_ctx.tn_qscale = args->quality;
ScanCtx.media_ctx.tn_size = args->size;
ScanCtx.media_ctx.content_size = args->content_size;
ScanCtx.media_ctx.log = sist_log;
ScanCtx.media_ctx.logf = sist_logf;
ScanCtx.media_ctx.store = _store;
// OOXML
ScanCtx.ooxml_ctx.content_size = args->content_size;
ScanCtx.ooxml_ctx.log = sist_log;
ScanCtx.ooxml_ctx.logf = sist_logf;
ScanCtx.tn_qscale = args->quality;
ScanCtx.tn_size = args->size;
ScanCtx.content_size = args->content_size;
ScanCtx.threads = args->threads; ScanCtx.threads = args->threads;
ScanCtx.depth = args->depth; ScanCtx.depth = args->depth;
ScanCtx.archive_mode = args->archive_mode;
strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path)); strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name)); strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name));
strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root)); strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root));
strncpy(ScanCtx.index.desc.rewrite_url, args->rewrite_url, sizeof(ScanCtx.index.desc.rewrite_url)); strncpy(ScanCtx.index.desc.rewrite_url, args->rewrite_url, sizeof(ScanCtx.index.desc.rewrite_url));
ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root); ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root);
ScanCtx.tesseract_lang = args->tesseract_lang;
ScanCtx.tesseract_path = args->tesseract_path;
ScanCtx.fast = args->fast; ScanCtx.fast = args->fast;
}
init_dir(ScanCtx.index.path);
void sist2_scan(scan_args_t *args) {
ScanCtx.mime_table = mime_get_mime_table(); ScanCtx.mime_table = mime_get_mime_table();
ScanCtx.ext_table = mime_get_ext_table(); ScanCtx.ext_table = mime_get_ext_table();
initialize_scan_context(args);
init_dir(ScanCtx.index.path);
char store_path[PATH_MAX]; char store_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path); snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
mkdir(store_path, S_IWUSR | S_IRUSR | S_IXUSR); mkdir(store_path, S_IWUSR | S_IRUSR | S_IXUSR);
@ -222,8 +262,6 @@ void sist2_web(web_args_t *args) {
int main(int argc, const char *argv[]) { int main(int argc, const char *argv[]) {
global_init();
scan_args_t *scan_args = scan_args_create(); scan_args_t *scan_args = scan_args_create();
index_args_t *index_args = index_args_create(); index_args_t *index_args = index_args_create();
web_args_t *web_args = web_args_create(); web_args_t *web_args = web_args_create();

View File

@ -11,6 +11,9 @@
__thread magic_t Magic = NULL; __thread magic_t Magic = NULL;
#define MIN_VIDEO_SIZE 1024 * 64
#define MIN_IMAGE_SIZE 1024 * 2
int fs_read(struct vfile *f, void *buf, size_t size) { int fs_read(struct vfile *f, void *buf, size_t size) {
if (f->fd == -1) { if (f->fd == -1) {
@ -112,34 +115,28 @@ void parse(void *arg) {
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) ||
(mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) { (mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
scan_media_ctx_t media_ctx; parse_media(&ScanCtx.media_ctx, &job->vfile, &doc);
media_ctx.tn_qscale = ScanCtx.tn_qscale;
media_ctx.tn_size = ScanCtx.tn_size;
media_ctx.content_size = ScanCtx.content_size;
parse_media(&media_ctx, &job->vfile, &doc);
} else if (IS_PDF(doc.mime)) { } else if (IS_PDF(doc.mime)) {
// parse_ebook(pdf_buf, doc.size, &doc); parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc.mime), &doc);
} else if (mmime == MimeText && ScanCtx.content_size > 0) { } else if (mmime == MimeText && ScanCtx.text_ctx.content_size > 0) {
// parse_text(bytes_read, &job->vfile, (char *) buf, &doc); parse_text(&ScanCtx.text_ctx, &job->vfile, &doc);
} else if (IS_FONT(doc.mime)) { } else if (IS_FONT(doc.mime)) {
// parse_font(font_buf, doc.size, &doc); parse_font(&ScanCtx.font_ctx, &job->vfile, &doc);
} else if ( } else if (
ScanCtx.archive_mode != ARC_MODE_SKIP && ( ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
IS_ARC(doc.mime) || IS_ARC(doc.mime) ||
(IS_ARC_FILTER(doc.mime) && should_parse_filtered_file(doc.filepath, doc.ext)) (IS_ARC_FILTER(doc.mime) && should_parse_filtered_file(doc.filepath, doc.ext))
)) { )) {
// parse_archive(&job->vfile, &doc); parse_archive(&ScanCtx.arc_ctx, &job->vfile, &doc);
} else if (ScanCtx.content_size > 0 && IS_DOC(doc.mime)) { } else if (ScanCtx.ooxml_ctx.content_size > 0 && IS_DOC(doc.mime)) {
// parse_doc(doc_buf, doc.size, &doc); parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, &doc);
} else if (is_cbr(doc.mime)) {
// parse_cbr(cbr_buf, doc.size, &doc);
} else if (is_cbr(&ScanCtx.cbr_ctx, doc.mime)) {
parse_cbr(&ScanCtx.cbr_ctx, &job->vfile, &doc);
} }
//Parent meta //Parent meta

View File

@ -11,7 +11,6 @@
#include "libscan/scan.h" #include "libscan/scan.h"
char *abspath(const char *path); char *abspath(const char *path);
char *expandpath(const char *path); char *expandpath(const char *path);

View File

@ -157,108 +157,6 @@ void thumbnail(struct mg_connection *nc, struct http_message *hm, struct mg_str
nc->flags |= MG_F_SEND_AND_CLOSE; nc->flags |= MG_F_SEND_AND_CLOSE;
} }
/**
* Modified version of onion_shortcut_response_file that allows
* browsers to seek media files.
*/
int chunked_response_file(const char *filename, const char *mime,
int partial, onion_request *request, onion_response *res) {
int fd = open(filename, O_RDONLY | O_CLOEXEC);
struct stat st;
if (fd < 0 || stat(filename, &st) != 0 || S_ISDIR(st.st_mode)) {
close(fd);
return OCS_NOT_PROCESSED;
}
size_t length = st.st_size;
size_t ends;
const char *range = onion_request_get_header(request, "Range");
if (partial && range && strncmp(range, "bytes=", 6) == 0) {
onion_response_set_header(res, "Accept-Ranges", "bytes");
onion_response_set_code(res, HTTP_PARTIAL_CONTENT);
char tmp[1024];
if (strlen(range + 6) >= sizeof(tmp)) {
close(fd);
return OCS_INTERNAL_ERROR;
}
strncpy(tmp, range + 6, sizeof(tmp) - 1);
char *start = tmp;
char *end = tmp;
while (*end != '-' && *end) {
end++;
}
if (*end == '-') {
*end = '\0';
end++;
size_t starts;
starts = atol(start);
if (*end) {
// %d-%d
ends = atol(end);
} else {
// %d-
ends = MIN(starts + CHUNK_SIZE, length);
}
if (ends > length || starts >= length || starts < 0) {
close(fd);
return OCS_INTERNAL_ERROR;
}
length = ends - starts;
if (starts != 0) {
lseek(fd, starts, SEEK_SET);
}
snprintf(tmp, sizeof(tmp), "bytes %ld-%ld/%ld",
starts, ends - 1, st.st_size);
onion_response_set_header(res, "Content-Range", tmp);
}
}
onion_response_set_length(res, length);
if (mime != NULL) {
onion_response_set_header(res, "Content-Type", mime);
} else {
onion_response_set_header(res, "Content-Type", "application/octet-stream");
}
onion_response_write_headers(res);
if ((onion_request_get_flags(request) & OR_HEAD) == OR_HEAD) {
length = 0;
}
if (length) {
int bytes_read = 0, bytes_written;
size_t total_read = 0;
char buf[4046];
if (length > sizeof(buf)) {
size_t max = length - sizeof(buf);
while (total_read < max) {
bytes_read = read(fd, buf, sizeof(buf));
if (bytes_read < 0) {
break;
}
total_read += bytes_read;
bytes_written = onion_response_write(res, buf, bytes_read);
if (bytes_written != bytes_read) {
break;
}
}
}
if (sizeof(buf) >= (length - total_read)) {
bytes_read = read(fd, buf, length - total_read);
onion_response_write(res, buf, bytes_read);
}
}
close(fd);
return OCS_PROCESSED;
}
void search(struct mg_connection *nc, struct http_message *hm) { void search(struct mg_connection *nc, struct http_message *hm) {
if (hm->body.len == 0) { if (hm->body.len == 0) {