mirror of
https://github.com/simon987/sist2.git
synced 2025-04-24 12:45:56 +00:00
unscramble submodules
This commit is contained in:
parent
a0a18c79e3
commit
8a7635359a
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,9 +1,6 @@
|
||||
[submodule "third-party/libscan"]
|
||||
path = third-party/libscan
|
||||
url = https://github.com/simon987/libscan
|
||||
[submodule "third-party/utf8.h"]
|
||||
path = third-party/utf8.h
|
||||
url = https://github.com/sheredom/utf8.h
|
||||
[submodule "third-party/argparse"]
|
||||
path = third-party/argparse
|
||||
url = https://github.com/cofyc/argparse
|
||||
|
1
third-party/libscan
vendored
Submodule
1
third-party/libscan
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 5d39dc675849ecb99d5308b02f7e1fd20ca5b410
|
10
third-party/libscan/.gitignore
vendored
10
third-party/libscan/.gitignore
vendored
@ -1,10 +0,0 @@
|
||||
.idea/
|
||||
cmake_install.cmake
|
||||
Makefile
|
||||
libscan.a
|
||||
libscan.so
|
||||
*.cbp
|
||||
CMakeFiles
|
||||
CMakeCache.txt
|
||||
scan_test
|
||||
third-party/
|
12
third-party/libscan/.gitmodules
vendored
12
third-party/libscan/.gitmodules
vendored
@ -1,12 +0,0 @@
|
||||
[submodule "third-party/uuid"]
|
||||
path = third-party/uuid
|
||||
url = https://github.com/certik/uuid
|
||||
[submodule "third-party/utf8.h"]
|
||||
path = third-party/utf8.h
|
||||
url = https://github.com/sheredom/utf8.h
|
||||
[submodule "third-party/libarchive"]
|
||||
path = third-party/libarchive
|
||||
url = https://github.com/libarchive/libarchive
|
||||
[submodule "third-party/zlib"]
|
||||
path = third-party/zlib
|
||||
url = https://github.com/madler/zlib
|
124
third-party/libscan/CMakeLists.txt
vendored
124
third-party/libscan/CMakeLists.txt
vendored
@ -1,124 +0,0 @@
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
|
||||
project(scan C)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
|
||||
add_library(
|
||||
scan
|
||||
libscan/util.c libscan/util.h
|
||||
libscan/scan.h
|
||||
libscan/macros.h
|
||||
|
||||
libscan/text/text.c libscan/text/text.h
|
||||
libscan/arc/arc.c libscan/arc/arc.h
|
||||
libscan/ebook/ebook.c libscan/ebook/ebook.h
|
||||
libscan/cbr/cbr.c libscan/cbr/cbr.h
|
||||
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
|
||||
libscan/media/media.c libscan/media/media.h
|
||||
libscan/font/font.c libscan/font/font.h
|
||||
|
||||
third-party/utf8.h
|
||||
)
|
||||
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
|
||||
target_link_directories(scan PRIVATE BEFORE /usr/share/vcpkg/installed/x64-linux/lib/)
|
||||
|
||||
find_package(LibArchive REQUIRED)
|
||||
find_package(BZip2 REQUIRED)
|
||||
find_package(lz4 REQUIRED)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
find_package(Tesseract CONFIG REQUIRED)
|
||||
find_package(harfbuzz CONFIG REQUIRED)
|
||||
find_package(OpenJPEG CONFIG REQUIRED)
|
||||
find_package(JPEG REQUIRED)
|
||||
find_package(LibXml2 REQUIRED)
|
||||
find_package(FFMPEG REQUIRED)
|
||||
#find_package(OpenSSL REQUIRED)
|
||||
find_package(LibLZMA REQUIRED)
|
||||
find_package(ZLIB REQUIRED)
|
||||
|
||||
|
||||
include(ExternalProject)
|
||||
find_program(MAKE_EXE NAMES gmake nmake make)
|
||||
ExternalProject_Add(
|
||||
mupdf
|
||||
# TODO: use master branch ?
|
||||
URL https://mupdf.com/downloads/archive/mupdf-1.16.1-source.tar.xz
|
||||
|
||||
UPDATE_COMMAND ""
|
||||
PATCH_COMMAND ""
|
||||
TEST_COMMAND ""
|
||||
CONFIGURE_COMMAND ""
|
||||
INSTALL_COMMAND ""
|
||||
|
||||
PREFIX "third-party/ext_mupdf"
|
||||
BINARY_DIR "third-party/ext_mupdf/src/mupdf"
|
||||
|
||||
BUILD_COMMAND CFLAGS=-fPIC HAVE_CURL=no HAVE_GLUT=no ${MAKE_EXE} -j 4 --silent
|
||||
&& ar d build/release/libmupdf-third.a jutils.o jdinput.o jdmarker.o jdmaster.o
|
||||
)
|
||||
SET(MUPDF_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_mupdf/src/mupdf/build/release/)
|
||||
SET(MUPDF_INC_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_mupdf/src/mupdf/include/)
|
||||
|
||||
|
||||
target_compile_options(
|
||||
scan
|
||||
PRIVATE
|
||||
-Werror
|
||||
-g
|
||||
)
|
||||
|
||||
add_dependencies(
|
||||
scan
|
||||
mupdf
|
||||
)
|
||||
|
||||
SET(CMAKE_C_LINK_EXECUTABLE "g++ <FLAGS> <CMAKE_C_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
|
||||
|
||||
string(REGEX REPLACE "-lvdpau" "" FFMPEG_LIBRARIES "${FFMPEG_LIBRARIES}")
|
||||
string(REGEX REPLACE "-lX11" "" FFMPEG_LIBRARIES "${FFMPEG_LIBRARIES}")
|
||||
|
||||
target_link_libraries(
|
||||
scan
|
||||
|
||||
${LibArchive_LIBRARIES}
|
||||
ZLIB::ZLIB
|
||||
BZip2::BZip2
|
||||
lz4::lz4
|
||||
zstd
|
||||
lzo2
|
||||
LibLZMA::LibLZMA
|
||||
|
||||
freetype
|
||||
|
||||
# OpenSSL::SSL OpenSSL::Crypto
|
||||
|
||||
stdc++
|
||||
|
||||
-Wl,--whole-archive
|
||||
m
|
||||
-Wl,--no-whole-archive
|
||||
|
||||
"${MUPDF_LIB_DIR}/libmupdf.a"
|
||||
"${MUPDF_LIB_DIR}/libmupdf-third.a"
|
||||
|
||||
${JPEG_LIBRARIES}
|
||||
${Tesseract_LIBRARIES}
|
||||
${LIBXML2_LIBRARIES}
|
||||
${FFMPEG_LIBRARIES}
|
||||
|
||||
${CMAKE_THREAD_LIBS_INIT}
|
||||
|
||||
uuid
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
scan
|
||||
BEFORE
|
||||
PUBLIC
|
||||
${MUPDF_INC_DIR}
|
||||
${JPEG_INCLUDE_DIR}
|
||||
${LIBXML2_INCLUDE_DIR}
|
||||
${FFMPEG_INCLUDE_DIR}
|
||||
)
|
7
third-party/libscan/README.md
vendored
7
third-party/libscan/README.md
vendored
@ -1,7 +0,0 @@
|
||||
|
||||
```bash
|
||||
vcpkg install libarchive pthread tesseract libxml2 ffmpeg
|
||||
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=/usr/share/vcpkg/scripts/buildsystems/vcpkg.cmake .
|
||||
make -j 4
|
||||
```
|
8
third-party/libscan/build.sh
vendored
8
third-party/libscan/build.sh
vendored
@ -1,8 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
export CC=gcc
|
||||
export CXX=g++
|
||||
|
||||
rm -rf CMakeFiles CMakeCache.txt
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=/usr/share/vcpkg/scripts/buildsystems/vcpkg.cmake . || exit
|
||||
make -j 4
|
167
third-party/libscan/libscan/arc/arc.c
vendored
167
third-party/libscan/libscan/arc/arc.c
vendored
@ -1,167 +0,0 @@
|
||||
#include "arc.h"
|
||||
|
||||
#include "../scan.h"
|
||||
#include "../util.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
|
||||
|
||||
/**
 * Tell whether a path, once its final extension is removed, ends in ".tar"
 * (e.g. "backup.tar.gz").
 *
 * Only the first (ext - 1) bytes of filepath are considered, i.e. everything
 * before the character at offset ext - 1.
 * NOTE(review): presumably ext is the offset of the first character after the
 * last '.', as computed for sub-jobs in parse_archive() -- confirm at callers.
 *
 * The check is done in place: no temporary copy is made, so an arbitrary
 * (negative or very large) ext can no longer overflow a stack buffer.
 *
 * @param filepath NUL-terminated path; NULL yields 0.
 * @param ext      offset described above; values <= 1 yield 0.
 * @return 1 (TRUE) if the stem ends in ".tar", 0 (FALSE) otherwise.
 */
int should_parse_filtered_file(const char *filepath, int ext) {
    static const char tar_suffix[] = ".tar";
    const size_t suffix_len = sizeof(tar_suffix) - 1;

    if (filepath == NULL || ext <= 1) {
        return 0;
    }

    // Length of the stem, clamped to the real end of the string so that an
    // oversized ext never makes us look past the terminator.
    size_t stem_len = (size_t) ext - 1;
    const char *terminator = memchr(filepath, '\0', stem_len);
    if (terminator != NULL) {
        stem_len = (size_t) (terminator - filepath);
    }

    if (stem_len < suffix_len) {
        return 0;
    }

    // "tar" holds no '.', so "suffix after the last dot equals .tar" is the
    // same as "stem ends with .tar".
    return memcmp(filepath + stem_len - suffix_len, tar_suffix, suffix_len) == 0;
}
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
return archive_read_data(f->arc, buf, size);
|
||||
}
|
||||
|
||||
typedef struct arc_data {
|
||||
vfile_t *f;
|
||||
char buf[ARC_BUF_SIZE];
|
||||
} arc_data_f;
|
||||
|
||||
int vfile_open_callback(struct archive *a, void *user_data) {
|
||||
arc_data_f *data = user_data;
|
||||
|
||||
if (data->f->is_fs_file && data->f->fd == -1) {
|
||||
data->f->fd = open(data->f->filepath, O_RDONLY);
|
||||
}
|
||||
|
||||
return ARCHIVE_OK;
|
||||
}
|
||||
|
||||
long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
|
||||
arc_data_f *data = user_data;
|
||||
|
||||
*buf = data->buf;
|
||||
return data->f->read(data->f, data->buf, ARC_BUF_SIZE);
|
||||
}
|
||||
|
||||
int vfile_close_callback(struct archive *a, void *user_data) {
|
||||
arc_data_f *data = user_data;
|
||||
|
||||
if (data->f->close != NULL) {
|
||||
data->f->close(data->f);
|
||||
}
|
||||
|
||||
return ARCHIVE_OK;
|
||||
}
|
||||
|
||||
/**
 * Open an archive and walk its entries.
 *
 * - Filesystem files are opened by name (doc->filepath).
 * - Non-filesystem files are only opened when ctx->mode is ARC_MODE_RECURSE,
 *   through the vfile_* callbacks; in any other mode nothing is done.
 * - In ARC_MODE_LIST the paths of all regular entries are joined with '\n'
 *   and attached to the document as a MetaContent line.
 * - Otherwise a sub-job is prepared for every regular entry (path
 *   "<archive path>#/<entry path>", base and ext offsets); handing the
 *   sub-job to the parser is still a TODO.
 *
 * @return SCAN_OK on success (or when there is nothing to do),
 *         SCAN_ERR_READ when the archive cannot be opened or an allocation
 *         fails (no dedicated out-of-memory code is visible in this file).
 */
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    struct archive *a;
    struct archive_entry *entry;

    arc_data_f data;
    data.f = f;

    int ret = 0;
    if (data.f->is_fs_file) {

        a = archive_read_new();
        archive_read_support_filter_all(a);
        archive_read_support_format_all(a);

        ret = archive_read_open_filename(a, doc->filepath, ARC_BUF_SIZE);
    } else if (ctx->mode == ARC_MODE_RECURSE) {

        a = archive_read_new();
        archive_read_support_filter_all(a);
        archive_read_support_format_all(a);

        ret = archive_read_open(
                a, &data,
                vfile_open_callback,
                vfile_read_callback,
                vfile_close_callback
        );
    } else {
        return SCAN_OK;
    }

    if (ret != ARCHIVE_OK) {
        //TODO: log
//        LOG_ERRORF(doc->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return SCAN_ERR_READ;
    }

    if (ctx->mode == ARC_MODE_LIST) {

        dyn_buffer_t buf = dyn_buffer_create();

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            if (S_ISREG(archive_entry_stat(entry)->st_mode)) {

                char *path = (char *) archive_entry_pathname(entry);
                if (path == NULL) {
                    // Entry name not available; nothing to list.
                    continue;
                }

                dyn_buffer_append_string(&buf, path);
                dyn_buffer_write_char(&buf, '\n');
            }
        }
        dyn_buffer_write_char(&buf, '\0');

        // buf.cur already accounts for the terminator written just above.
        meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
        if (meta_list == NULL) {
            dyn_buffer_destroy(&buf);
            archive_read_free(a);
            return SCAN_ERR_READ;
        }
        meta_list->key = MetaContent;
        strcpy(meta_list->str_val, buf.buf);
        APPEND_META(doc, meta_list);
        dyn_buffer_destroy(&buf);

    } else {

        // Room reserved after the job structure for the sub-job path.
        // NOTE(review): presumably parse_job_t ends with the filepath array
        // that uses these trailing bytes -- confirm against scan.h.
        const size_t path_cap = PATH_MAX * 2;

        parse_job_t *sub_job = malloc(sizeof(parse_job_t) + path_cap);
        if (sub_job == NULL) {
            archive_read_free(a);
            return SCAN_ERR_READ;
        }

        sub_job->vfile.close = NULL;
        sub_job->vfile.read = arc_read;
        sub_job->vfile.arc = a;
        sub_job->vfile.filepath = sub_job->filepath;
        sub_job->vfile.is_fs_file = FALSE;
        memcpy(sub_job->parent, doc->uuid, sizeof(uuid_t));

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            sub_job->info = *archive_entry_stat(entry);
            if (!S_ISREG(sub_job->info.st_mode)) {
                continue;
            }

            const char *entry_path = archive_entry_pathname(entry);
            if (entry_path == NULL) {
                continue;
            }

            // Bounded formatting: an entry whose combined path does not fit
            // is skipped instead of overflowing the buffer.
            int written = snprintf(sub_job->filepath, path_cap, "%s#/%s", f->filepath, entry_path);
            if (written < 0 || (size_t) written >= path_cap) {
                continue;
            }

            // The "#/" separator guarantees at least one '/' in the path.
            sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;

            char *p = strrchr(sub_job->filepath, '.');
            if (p != NULL) {
                sub_job->ext = (int) (p - sub_job->filepath + 1);
            } else {
                sub_job->ext = (int) strlen(sub_job->filepath);
            }

            //TODO:
//            parse(sub_job);
        }

        free(sub_job);
    }

    archive_read_free(a);
    return SCAN_OK;
}
|
26
third-party/libscan/libscan/arc/arc.h
vendored
26
third-party/libscan/libscan/arc/arc.h
vendored
@ -1,26 +0,0 @@
|
||||
#ifndef SCAN_ARC_H
|
||||
#define SCAN_ARC_H
|
||||
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#include "../scan.h"
|
||||
|
||||
#define ARC_MODE_SKIP 0
|
||||
#define ARC_MODE_LIST 1
|
||||
#define ARC_MODE_SHALLOW 2
|
||||
#define ARC_MODE_RECURSE 3
|
||||
typedef int archive_mode_t;
|
||||
|
||||
typedef struct {
|
||||
archive_mode_t mode;
|
||||
} scan_arc_ctx_t;
|
||||
|
||||
#define ARC_BUF_SIZE 8192
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext);
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
int arc_read(struct vfile * f, void *buf, size_t size);
|
||||
|
||||
#endif
|
65
third-party/libscan/libscan/cbr/cbr.c
vendored
65
third-party/libscan/libscan/cbr/cbr.c
vendored
@ -1,65 +0,0 @@
|
||||
#include "cbr.h"
|
||||
#include "../scan.h"
|
||||
#include "../util.h"
|
||||
#include "../arc/arc.h"
|
||||
#include "../ebook/ebook.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <archive.h>
|
||||
|
||||
unsigned int cbr_mime;
|
||||
unsigned int cbz_mime;
|
||||
|
||||
/*
 * Intended to resolve the numeric mime ids of the comic-book formats into
 * cbr_mime / cbz_mime. The lookup is disabled for now, so this function
 * currently does nothing.
 */
void cbr_init() {
    // TODO: get mime str
    // cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr");
    // cbz_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbz");
}
|
||||
|
||||
int is_cbr(unsigned int mime) {
|
||||
return mime == cbr_mime;
|
||||
}
|
||||
|
||||
/**
 * Re-pack a RAR comic book (CBR) as an in-memory ZIP so that it can be
 * handed to the ebook parser as a CBZ.
 *
 * The whole input is read with read_all(), every RAR entry is copied into a
 * ZIP written to a buffer of twice the input size, and doc->mime is switched
 * to cbz_mime around the (currently disabled) parse_ebook() call, then set
 * back to cbr_mime.
 *
 * Failures to allocate or to open either archive now stop the conversion
 * instead of carrying on with unusable handles.
 *
 * NOTE(review): `buf` returned by read_all() is not released here, as in the
 * original; confirm who owns it.
 */
void parse_cbr(scan_cbr_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    // TODO: we probably only need 1.2x or 1.5x, even better would be a dynamic buffer
    const size_t out_cap = buf_len * 2;
    size_t out_buf_used = 0;

    struct archive *rar_in = NULL;
    struct archive *zip_out = NULL;

    char *out_buf = malloc(out_cap);
    if (out_buf == NULL) {
        goto cleanup;
    }

    rar_in = archive_read_new();
    zip_out = archive_write_new();
    if (rar_in == NULL || zip_out == NULL) {
        goto cleanup;
    }

    archive_read_support_filter_none(rar_in);
    archive_read_support_format_rar(rar_in);
    if (archive_read_open_memory(rar_in, buf, buf_len) != ARCHIVE_OK) {
        goto cleanup;
    }

    archive_write_set_format_zip(zip_out);
    if (archive_write_open_memory(zip_out, out_buf, out_cap, &out_buf_used) != ARCHIVE_OK) {
        goto cleanup;
    }

    struct archive_entry *entry;
    while (archive_read_next_header(rar_in, &entry) == ARCHIVE_OK) {
        int hdr = archive_write_header(zip_out, entry);
        if (hdr == ARCHIVE_FATAL) {
            // The output archive is no longer usable.
            break;
        }
        if (hdr < ARCHIVE_WARN) {
            // Header rejected: skip this entry's data, keep converting.
            continue;
        }

        char arc_buf[ARC_BUF_SIZE];
        int len = archive_read_data(rar_in, arc_buf, ARC_BUF_SIZE);
        while (len > 0) {
            if (archive_write_data(zip_out, arc_buf, len) < 0) {
                break;
            }
            len = archive_read_data(rar_in, arc_buf, ARC_BUF_SIZE);
        }
    }

    // Close both archives before the output buffer is consumed.
    archive_write_close(zip_out);
    archive_write_free(zip_out);
    zip_out = NULL;

    archive_read_close(rar_in);
    archive_read_free(rar_in);
    rar_in = NULL;

    doc->mime = cbz_mime;

    //TODO: get mime string
//    parse_ebook(out_buf, out_buf_used, doc);

cleanup:
    if (zip_out != NULL) {
        archive_write_free(zip_out);
    }
    if (rar_in != NULL) {
        archive_read_free(rar_in);
    }

    // The document keeps its CBR mime on every path, as before.
    doc->mime = cbr_mime;
    free(out_buf);
}
|
17
third-party/libscan/libscan/cbr/cbr.h
vendored
17
third-party/libscan/libscan/cbr/cbr.h
vendored
@ -1,17 +0,0 @@
|
||||
#ifndef SCAN_CBR_H
|
||||
#define SCAN_CBR_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "../scan.h"
|
||||
|
||||
typedef struct {
|
||||
|
||||
} scan_cbr_ctx_t;
|
||||
|
||||
void cbr_init();
|
||||
|
||||
int is_cbr(unsigned int mime);
|
||||
|
||||
void parse_cbr(scan_cbr_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
#endif
|
334
third-party/libscan/libscan/ebook/ebook.c
vendored
334
third-party/libscan/libscan/ebook/ebook.c
vendored
@ -1,334 +0,0 @@
|
||||
#include "ebook.h"
|
||||
#include "../util.h"
|
||||
#include <mupdf/fitz.h>
|
||||
#include <pthread.h>
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#define MIN_OCR_SIZE 350
|
||||
#define MIN_OCR_LEN 10
|
||||
|
||||
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
|
||||
__thread text_buffer_t thread_buffer;
|
||||
__thread scan_ebook_ctx_t thread_ctx;
|
||||
|
||||
|
||||
/**
 * Render the first page of a document as a thumbnail.
 *
 * The page is scaled so that its longer side measures ctx->tn_size, drawn on
 * a white RGB pixmap (under ctx->mupdf_mutex) and encoded as PNG. Storing
 * the PNG is currently disabled (see the commented store_write call).
 *
 * @return TRUE when loading, drawing and encoding all succeed,
 *         FALSE as soon as one of those steps raises a MuPDF error.
 */
int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {

    int status = 0;
    fz_page *first_page = NULL;

    fz_var(first_page);
    fz_var(status);
    fz_try(fzctx)
    {
        first_page = fz_load_page(fzctx, fzdoc, 0);
    }
    fz_catch(fzctx)
    {
        status = 1;
    }

    if (status != 0) {
        fz_drop_page(fzctx, first_page);
//        LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, ctx->error.message)
        return FALSE;
    }

    fz_rect bounds = fz_bound_page(fzctx, first_page);

    // Fit the longer side of the page into tn_size.
    float w = (float) bounds.x1 - bounds.x0;
    float h = (float) bounds.y1 - bounds.y0;
    float longest_side = (w > h) ? w : h;
    float scale = (float) ctx->tn_size / longest_side;
    fz_matrix m = fz_scale(scale, scale);

    bounds = fz_transform_rect(bounds, m);
    fz_irect bbox = fz_round_rect(bounds);
    fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fzctx->colorspace->rgb, bbox, NULL, 0);

    // White background.
    fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
    fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);

    fz_var(status);
    fz_try(fzctx)
    {
        pthread_mutex_lock(&ctx->mupdf_mutex);
        fz_run_page(fzctx, first_page, dev, fz_identity, NULL);
    }
    fz_always(fzctx)
    {
        fz_close_device(fzctx, dev);
        fz_drop_device(fzctx, dev);
        pthread_mutex_unlock(&ctx->mupdf_mutex);
    }
    fz_catch(fzctx)
    {
        status = fzctx->error.errcode;
    }

    if (status != 0) {
//        LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, ctx->error.message)
        fz_drop_page(fzctx, first_page);
        fz_drop_pixmap(fzctx, pixmap);
        return FALSE;
    }

    fz_buffer *png = NULL;
    fz_var(png);
    fz_var(status);

    fz_try(fzctx)
    {
        png = fz_new_buffer_from_pixmap_as_png(fzctx, pixmap, fz_default_color_params);
    }
    fz_catch(fzctx)
    {
        status = fzctx->error.errcode;
    }

    if (status == 0) {
        unsigned char *tn_buf;
        size_t tn_len = fz_buffer_storage(fzctx, png, &tn_buf);
//        store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len);
    }

    fz_drop_buffer(fzctx, png);
    fz_drop_pixmap(fzctx, pixmap);
    fz_drop_page(fzctx, first_page);

//    On failure: LOG_WARNINGF(doc->filepath, "fz_new_buffer_from_pixmap_as_png() returned error code [%d] %s", err,
//                             ctx->error.message)
    return (status == 0) ? TRUE : FALSE;
}
|
||||
|
||||
/*
 * Warning/error print hook installed on the MuPDF context by init_fzctx().
 * Logging is disabled for now, so messages are silently dropped.
 */
void fz_err_callback(void *user, UNUSED(const char *message)) {
    (void) user;
//    if (LogCtx.verbose) {
//        document_t *doc = (document_t *) user;
//        LOG_WARNINGF(doc->filepath, "FZ: %s", message)
//    }
}
|
||||
|
||||
/*
 * Prepare a fresh MuPDF context: turn ICC off, register the document
 * handlers and route both warning and error output to fz_err_callback with
 * the document as user pointer.
 */
static void init_fzctx(fz_context *fzctx, document_t *doc) {
    fz_disable_icc(fzctx);
    fz_register_document_handlers(fzctx);

    // Warnings
    fzctx->warn.print = fz_err_callback;
    fzctx->warn.print_user = doc;

    // Errors
    fzctx->error.print = fz_err_callback;
    fzctx->error.print_user = doc;
}
|
||||
|
||||
/*
 * Append every character of a structured-text block to `tex`.
 * Blocks that are not text blocks are ignored.
 *
 * Returns TEXT_BUF_FULL as soon as the text buffer reports it is full,
 * 0 otherwise.
 */
static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
    if (block->type == FZ_STEXT_BLOCK_TEXT) {
        for (fz_stext_line *ln = block->u.t.first_line; ln != NULL; ln = ln->next) {
            for (fz_stext_char *ch = ln->first_char; ch != NULL; ch = ch->next) {
                if (text_buffer_append_char(tex, ch->c) == TEXT_BUF_FULL) {
                    return TEXT_BUF_FULL;
                }
            }
        }
    }
    return 0;
}
|
||||
|
||||
#define IS_VALID_BPP(d) (d==1 || d==2 || d==4 || d==8 || d==16 || d==24 || d==32)
|
||||
|
||||
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
|
||||
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
|
||||
UNUSED(fz_color_params color_params)) {
|
||||
|
||||
int l2factor = 0;
|
||||
|
||||
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {
|
||||
|
||||
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
|
||||
|
||||
if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
TessBaseAPIInit3(api, thread_ctx.tesseract_path, thread_ctx.tesseract_lang);
|
||||
|
||||
TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
|
||||
TessBaseAPISetSourceResolution(api, pix->xres);
|
||||
|
||||
char *text = TessBaseAPIGetUTF8Text(api);
|
||||
size_t len = strlen(text);
|
||||
if (len >= MIN_OCR_LEN) {
|
||||
text_buffer_append_string(&thread_buffer, text, len - 1);
|
||||
// LOG_DEBUGF(
|
||||
// "ebook.c",
|
||||
// "(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
|
||||
// pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
|
||||
// )
|
||||
}
|
||||
|
||||
TessBaseAPIEnd(api);
|
||||
TessBaseAPIDelete(api);
|
||||
}
|
||||
fz_drop_pixmap(fzctx, pix);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extract title, cover thumbnail and text content from a document that
 * MuPDF can open.
 *
 * Steps:
 *  1. read the whole file in memory and open it with the given mime string;
 *  2. attach the title (if any) as a MetaTitle line;
 *  3. when ctx->tn_size > 0, render the cover with render_cover();
 *  4. when ctx->content_size > 0, run every page through a structured-text
 *     device (with OCR of images when ctx->tesseract_lang is set) until the
 *     text buffer is full, then attach the text as a MetaContent line.
 *
 * Fixes over the previous version:
 *  - text extraction used to be skipped whenever the cover was rendered
 *    successfully (the test on render_cover()'s result was inverted); it is
 *    now skipped only when rendering fails;
 *  - the OCR settings read by fill_image() from the thread-local thread_ctx
 *    were never set; they are now copied from ctx;
 *  - a failed fz_new_context() is detected.
 *
 * NOTE(review): `buf` returned by read_all() is not released here, as in the
 * original; confirm who owns it.
 * NOTE(review): the one-time mutex initialisation below is guarded by a plain
 * static flag, which is not safe if several threads enter concurrently.
 */
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    static int mu_is_initialized = 0;
    if (!mu_is_initialized) {
        pthread_mutex_init(&ctx->mupdf_mutex, NULL);
        mu_is_initialized = 1;
    }

    // fill_image() has no user pointer; it reads the OCR settings from here.
    thread_ctx.tesseract_lang = ctx->tesseract_lang;
    thread_ctx.tesseract_path = ctx->tesseract_path;

    fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
    if (fzctx == NULL) {
        return;
    }

    init_fzctx(fzctx, doc);

    int err = 0;

    fz_document *fzdoc = NULL;
    fz_stream *stream = NULL;
    fz_var(fzdoc);
    fz_var(stream);
    fz_var(err);

    fz_try(fzctx)
    {
        stream = fz_open_memory(fzctx, buf, buf_len);
        fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
    }
    fz_catch(fzctx)
    {
        err = fzctx->error.errcode;
    }

    if (err != 0) {
        goto cleanup;
    }

    char title[4096] = {'\0',};
    fz_try(fzctx)
    {
        fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
    }
    fz_catch(fzctx)
    {
        // A missing or unreadable title is not an error.
    }

    if (strlen(title) > 0) {
        // +1: always leave room for the terminator.
        meta_line_t *meta_title = malloc(sizeof(meta_line_t) + strlen(title) + 1);
        if (meta_title != NULL) {
            meta_title->key = MetaTitle;
            strcpy(meta_title->str_val, title);
            APPEND_META(doc, meta_title)
        }
    }

    int page_count = -1;
    fz_var(err);
    fz_try(fzctx)
    {
        page_count = fz_count_pages(fzctx, fzdoc);
    }
    fz_catch(fzctx)
    {
        err = fzctx->error.errcode;
    }

    if (err) {
//        LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, ctx->error.message)
        goto cleanup;
    }

    // render_cover() returns TRUE on success: give up only when it fails.
    if (ctx->tn_size > 0 && render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
        goto cleanup;
    }

    if (ctx->content_size > 0) {
        fz_stext_options opts = {0};
        thread_buffer = text_buffer_create(ctx->content_size);

        for (int current_page = 0; current_page < page_count; current_page++) {
            fz_page *page = NULL;
            fz_var(err);
            fz_try(fzctx)
            {
                page = fz_load_page(fzctx, fzdoc, current_page);
            }
            fz_catch(fzctx)
            {
                err = fzctx->error.errcode;
            }
            if (err != 0) {
//                LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, ctx->error.message)
                text_buffer_destroy(&thread_buffer);
                fz_drop_page(fzctx, page);
                goto cleanup;
            }

            fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
            fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);

            // Only text (and optionally images, for OCR) is of interest.
            dev->stroke_path = NULL;
            dev->stroke_text = NULL;
            dev->clip_text = NULL;
            dev->clip_stroke_path = NULL;
            dev->clip_stroke_text = NULL;

            if (ctx->tesseract_lang != NULL) {
                dev->fill_image = fill_image;
            }

            fz_var(err);
            fz_try(fzctx)
            {
                fz_run_page(fzctx, page, dev, fz_identity, NULL);
            }
            fz_always(fzctx)
            {
                fz_close_device(fzctx, dev);
                fz_drop_device(fzctx, dev);
            }
            fz_catch(fzctx)
            {
                err = fzctx->error.errcode;
            }

            if (err != 0) {
//                LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, ctx->error.message)
                text_buffer_destroy(&thread_buffer);
                fz_drop_page(fzctx, page);
                fz_drop_stext_page(fzctx, stext);
                goto cleanup;
            }

            fz_stext_block *block = stext->first_block;
            while (block != NULL) {
                int ret = read_stext_block(block, &thread_buffer);
                if (ret == TEXT_BUF_FULL) {
                    break;
                }
                block = block->next;
            }
            fz_drop_stext_page(fzctx, stext);
            fz_drop_page(fzctx, page);

            // Stop reading pages once the text buffer is full.
            if (thread_buffer.dyn_buffer.cur >= thread_buffer.dyn_buffer.size) {
                break;
            }
        }
        text_buffer_terminate_string(&thread_buffer);

        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
        if (meta_content != NULL) {
            meta_content->key = MetaContent;
            memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
            APPEND_META(doc, meta_content)
        }

        text_buffer_destroy(&thread_buffer);
    }

cleanup:
    fz_drop_stream(fzctx, stream);
    fz_drop_document(fzctx, fzdoc);
    fz_drop_context(fzctx);
}
|
16
third-party/libscan/libscan/ebook/ebook.h
vendored
16
third-party/libscan/libscan/ebook/ebook.h
vendored
@ -1,16 +0,0 @@
|
||||
#ifndef SCAN_EBOOK_H
|
||||
#define SCAN_EBOOK_H
|
||||
|
||||
#include "../scan.h"
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
int tn_size;
|
||||
const char *tesseract_lang;
|
||||
const char *tesseract_path;
|
||||
pthread_mutex_t mupdf_mutex;
|
||||
} scan_ebook_ctx_t;
|
||||
|
||||
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str, document_t *doc);
|
||||
|
||||
#endif
|
234
third-party/libscan/libscan/font/font.c
vendored
234
third-party/libscan/libscan/font/font.c
vendored
@ -1,234 +0,0 @@
|
||||
#include "font.h"
|
||||
|
||||
#include <ft2build.h>
|
||||
#include <freetype/freetype.h>
|
||||
#include "../util.h"
|
||||
|
||||
|
||||
__thread FT_Library ft_lib = NULL;
|
||||
|
||||
|
||||
/* Pixel size of a rendered line of text, as computed by text_dimension(). */
typedef struct text_dimensions {
    unsigned int width;    /* sum of the per-glyph advances (plus kerning) */
    unsigned int height;   /* tallest ascent + deepest descent */
    unsigned int baseline; /* deepest descent */
} text_dimensions_t;
|
||||
|
||||
/* Metrics and bitmap of one rendered glyph, filled by ft_glyph_to_glyph(). */
typedef struct glyph {
    int top;               /* bitmap_top of the glyph slot */
    int height;            /* bitmap rows */
    int width;             /* bitmap width */
    int descent;           /* rows below the top bearing, never negative */
    int ascent;            /* remaining rows above, never negative */
    int advance_width;     /* horizontal advance, in pixels */
    unsigned char *pixmap; /* bitmap buffer of the glyph slot (not owned) */
} glyph_t;
|
||||
|
||||
|
||||
__always_inline
|
||||
int kerning_offset(char c, char pc, FT_Face face) {
|
||||
FT_Vector kerning;
|
||||
FT_Get_Kerning(face, c, pc, FT_KERNING_DEFAULT, &kerning);
|
||||
|
||||
return (int) (kerning.x / 64);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
|
||||
glyph_t glyph;
|
||||
|
||||
glyph.pixmap = slot->bitmap.buffer;
|
||||
|
||||
glyph.width = (int) slot->bitmap.width;
|
||||
glyph.height = (int) slot->bitmap.rows;
|
||||
glyph.top = slot->bitmap_top;
|
||||
glyph.advance_width = (int) slot->advance.x / 64;
|
||||
|
||||
glyph.descent = MAX(0, glyph.height - glyph.top);
|
||||
glyph.ascent = MAX(0, MAX(glyph.top, glyph.height) - glyph.descent);
|
||||
|
||||
return glyph;
|
||||
}
|
||||
|
||||
/*
 * Measure `text` when drawn with `face`.
 *
 * Every character is loaded (the result of FT_Load_Char is not checked) and
 * the returned dimensions accumulate:
 *   width    - the larger of advance and bitmap width per glyph, plus kerning;
 *   height   - largest ascent/height seen plus largest descent seen;
 *   baseline - largest descent seen.
 */
text_dimensions_t text_dimension(char *text, FT_Face face) {
    text_dimensions_t dim;
    dim.width = 0;

    unsigned int top_extent = 0;
    int bottom_extent = 0;
    char previous = 0;

    const int count = (int) strlen(text);
    for (int idx = 0; idx < count; idx++) {
        const char current = text[idx];

        FT_Load_Char(face, current, 0);
        glyph_t g = ft_glyph_to_glyph(face->glyph);

        bottom_extent = MAX(bottom_extent, g.descent);
        top_extent = MAX(top_extent, MAX(g.height, g.ascent));

        dim.width += MAX(g.advance_width, g.width) + kerning_offset(current, previous, face);

        previous = current;
    }

    dim.height = top_extent + bottom_extent;
    dim.baseline = bottom_extent;

    return dim;
}
|
||||
|
||||
/*
 * OR the pixels of `glyph` into `bitmap`, a text_info.width x
 * text_info.height single-byte-per-pixel image, with the glyph's top-left
 * corner at (x, y). Pixels whose destination index falls outside the image
 * are skipped.
 */
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
    const unsigned int stride = text_info.width;
    const unsigned int limit = text_info.width * text_info.height;
    const unsigned int glyph_w = glyph->width;
    const unsigned int glyph_h = glyph->height;
    const unsigned int origin = y * text_info.width + x;

    for (unsigned int row = 0; row < glyph_h; row++) {
        const unsigned int row_start = origin + row * stride;

        for (unsigned int col = 0; col < glyph_w; col++) {
            const unsigned int out_idx = row_start + col;

            if (out_idx < limit) {
                bitmap[out_idx] |= glyph->pixmap[row * glyph_w + col];
            }
        }
    }
}
|
||||
|
||||
/**
 * Serialise an 8-bit grayscale bitmap as a BMP file into `buf`.
 *
 * Layout written: 14-byte file header, 40-byte DIB header, a 256-entry
 * color table (inverted gray ramp, 255 down to 0), then the pixel rows from
 * bottom to top. `buf` is expected to be empty on entry, since the file size
 * is patched at byte offset 2 of the buffer.
 *
 * Fixes over the previous version:
 *  - each row is padded to a multiple of 4 bytes based on the row length.
 *    Padding used to be derived from the absolute buffer offset, and since
 *    the pixel data starts at offset 1078 (2 mod 4) the first row received
 *    the wrong amount of padding;
 *  - the size field is written with memcpy instead of a misaligned int store.
 *
 * @param buf        destination buffer.
 * @param dimensions width and height of `bitmap`, in pixels.
 * @param bitmap     width * height bytes, top row first.
 */
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {

    dyn_buffer_write_short(buf, 0x4D42); // Magic
    dyn_buffer_write_int(buf, 0); // Size placeholder
    dyn_buffer_write_int(buf, 0x5157); //Reserved
    dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset

    dyn_buffer_write_int(buf, 40); // DIB size
    dyn_buffer_write_int(buf, (int) dimensions.width);
    dyn_buffer_write_int(buf, (int) dimensions.height);
    dyn_buffer_write_short(buf, 1); // Color planes
    dyn_buffer_write_short(buf, 8); // bits per pixel
    dyn_buffer_write_int(buf, 0); // compression
    dyn_buffer_write_int(buf, 0); // Ignored
    dyn_buffer_write_int(buf, 3800); // hres
    dyn_buffer_write_int(buf, 3800); // vres
    dyn_buffer_write_int(buf, 256); // Color count
    dyn_buffer_write_int(buf, 0); // Ignored

    // RGBA32 Color table (Grayscale)
    for (int i = 255; i >= 0; i--) {
        dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
    }

    // One byte per pixel: bytes needed to round a row up to a multiple of 4.
    const unsigned int row_padding = (4 - dimensions.width % 4) % 4;

    // Pixel array: write from bottom to top, with rows padded to multiples of 4-bytes
    for (int y = (int) dimensions.height - 1; y >= 0; y--) {
        for (unsigned int x = 0; x < dimensions.width; x++) {
            dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
        }
        for (unsigned int p = 0; p < row_padding; p++) {
            dyn_buffer_write_char(buf, 0);
        }
    }

    // Size: patch the placeholder that follows the 2-byte magic.
    int file_size = (int) buf->cur;
    memcpy((char *) buf->buf + 2, &file_size, sizeof(file_size));
}
|
||||
|
||||
/**
 * Extract the name of a font file and, when thumbnails are enabled, render
 * that name with the font itself into a grayscale BMP.
 *
 * The name ("<family> <style>", or just the family when the style is missing
 * or starts with '?') is attached to the document as a MetaFontName line.
 * Storing the generated BMP is currently disabled (see the commented
 * store_write call).
 *
 * Fixes over the previous version:
 *  - the thumbnail was skipped when ctx->enable_tn was TRUE (inverted test);
 *  - the family name was copied with an unbounded strcpy into a 1024-byte
 *    buffer, and a NULL family name could be passed to "%s";
 *  - FT_Init_FreeType, malloc and calloc results are now checked.
 *
 * NOTE(review): `buf` returned by read_all() is not released here, as in the
 * original. FreeType requires it to stay valid until FT_Done_Face(); if
 * read_all() allocates it, it should be freed after that call -- confirm.
 */
void parse_font(scan_font_cxt_t *ctx, vfile_t *f, document_t *doc) {
    if (ft_lib == NULL) {
        if (FT_Init_FreeType(&ft_lib) != 0) {
            ft_lib = NULL;
            return;
        }
    }

    size_t buf_len;
    void * buf = read_all(f, &buf_len);

    FT_Face face;
    FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, buf_len, 0, &face);
    if (err != 0) {
//        LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err, ft_error_string(err));
        return;
    }

    char font_name[1024];
    const char *family = (face->family_name != NULL) ? face->family_name : "(null)";

    if (face->style_name == NULL || *(face->style_name) == '?') {
        snprintf(font_name, sizeof(font_name), "%s", family);
    } else {
        snprintf(font_name, sizeof(font_name), "%s %s", family, face->style_name);
    }

    // +1: always leave room for the terminator.
    meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name) + 1);
    if (meta_name == NULL) {
        FT_Done_Face(face);
        return;
    }
    meta_name->key = MetaFontName;
    strcpy(meta_name->str_val, font_name);
    APPEND_META(doc, meta_name)

    // Nothing more to do unless a thumbnail was requested.
    if (!ctx->enable_tn) {
        FT_Done_Face(face);
        return;
    }

    int pixel = 64;
    int num_chars = (int) strlen(font_name);

    err = FT_Set_Pixel_Sizes(face, 0, pixel);
    if (err != 0) {
//        LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err, ft_error_string(err))
        FT_Done_Face(face);
        return;
    }

    text_dimensions_t dimensions = text_dimension(font_name, face);

    // Request at least one byte so that a NULL result always means failure.
    size_t n_pixels = (size_t) dimensions.width * dimensions.height;
    unsigned char *bitmap = calloc(n_pixels > 0 ? n_pixels : 1, 1);
    if (bitmap == NULL) {
        FT_Done_Face(face);
        return;
    }

    FT_Vector pen;
    pen.x = 0;

    char pc = 0;
    for (int i = 0; i < num_chars; i++) {
        char c = font_name[i];

        err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
        if (err != 0) {
            // Retry with the other letter case before giving up on this character.
            c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
            err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
            if (err != 0) {
//                LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err, ft_error_string(err));
                continue;
            }
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        pen.x += kerning_offset(c, pc, face);
        if (pen.x <= 0) {
            pen.x = ABS(glyph.advance_width - glyph.width);
        }
        pen.y = dimensions.height - glyph.ascent - dimensions.baseline;

        draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);

        pen.x += glyph.advance_width;
        pc = c;
    }

    dyn_buffer_t bmp_data = dyn_buffer_create();
    bmp_format(&bmp_data, dimensions, bitmap);

//    store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);

    FT_Done_Face(face);
}
|
||||
|
||||
void cleanup_font() {
|
||||
FT_Done_FreeType(ft_lib);
|
||||
}
|
14
third-party/libscan/libscan/font/font.h
vendored
14
third-party/libscan/libscan/font/font.h
vendored
@ -1,14 +0,0 @@
|
||||
#ifndef SCAN_FONT_H
|
||||
#define SCAN_FONT_H
|
||||
|
||||
#include "../scan.h"
|
||||
|
||||
|
||||
typedef struct {
|
||||
int enable_tn;
|
||||
} scan_font_cxt_t;
|
||||
|
||||
void parse_font(scan_font_cxt_t *ctx, vfile_t *f, document_t *doc);
|
||||
void cleanup_font();
|
||||
|
||||
#endif
|
21
third-party/libscan/libscan/macros.h
vendored
21
third-party/libscan/libscan/macros.h
vendored
@ -1,21 +0,0 @@
|
||||
/* Boolean helpers, only provided when the platform has not defined them. */
#if !defined(FALSE)
#define FALSE (0)
#define BOOL int
#endif

#if !defined(TRUE)
#define TRUE (!FALSE)
#endif

/*
 * MAX, MIN and ABS always replace any earlier definition.
 * Each argument may be evaluated more than once: do not pass expressions
 * with side effects.
 */
#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))

/* Fallback for platforms whose headers do not define PATH_MAX. */
#if !defined(PATH_MAX)
#define PATH_MAX 4096
#endif

#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
|
419
third-party/libscan/libscan/media/media.c
vendored
419
third-party/libscan/libscan/media/media.c
vendored
@ -1,419 +0,0 @@
|
||||
#include "media.h"
|
||||
|
||||
#include "../util.h"
|
||||
|
||||
#include "libavformat/avformat.h"
|
||||
#include "libswscale/swscale.h"
|
||||
#include "libswresample/swresample.h"
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavutil/imgutils.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#define MIN_SIZE 32
|
||||
#define AVIO_BUF_SIZE 8192
|
||||
|
||||
__always_inline
|
||||
static AVCodecContext *alloc_jpeg_encoder(int dstW, int dstH, float qscale) {
|
||||
|
||||
AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
|
||||
AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
|
||||
jpeg->width = dstW;
|
||||
jpeg->height = dstH;
|
||||
jpeg->time_base.den = 1000000;
|
||||
jpeg->time_base.num = 1;
|
||||
jpeg->i_quant_factor = qscale;
|
||||
|
||||
jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;
|
||||
int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
|
||||
|
||||
if (ret != 0) {
|
||||
printf("Could not open jpeg encoder: %s!\n", av_err2str(ret));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return jpeg;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
AVFrame *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
|
||||
|
||||
int dstW;
|
||||
int dstH;
|
||||
if (frame->width <= size && frame->height <= size) {
|
||||
dstW = frame->width;
|
||||
dstH = frame->height;
|
||||
} else {
|
||||
double ratio = (double) frame->width / frame->height;
|
||||
if (frame->width > frame->height) {
|
||||
dstW = size;
|
||||
dstH = (int) (size / ratio);
|
||||
} else {
|
||||
dstW = (int) (size * ratio);
|
||||
dstH = size;
|
||||
}
|
||||
}
|
||||
|
||||
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
AVFrame *scaled_frame = av_frame_alloc();
|
||||
|
||||
struct SwsContext *ctx = sws_getContext(
|
||||
decoder->width, decoder->height, decoder->pix_fmt,
|
||||
dstW, dstH, AV_PIX_FMT_YUVJ420P,
|
||||
SWS_FAST_BILINEAR, 0, 0, 0
|
||||
);
|
||||
|
||||
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
|
||||
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
|
||||
|
||||
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
|
||||
|
||||
sws_scale(ctx,
|
||||
(const uint8_t *const *) frame->data, frame->linesize,
|
||||
0, decoder->height,
|
||||
scaled_frame->data, scaled_frame->linesize
|
||||
);
|
||||
|
||||
scaled_frame->width = dstW;
|
||||
scaled_frame->height = dstH;
|
||||
scaled_frame->format = AV_PIX_FMT_YUV420P;
|
||||
|
||||
sws_freeContext(ctx);
|
||||
|
||||
return scaled_frame;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx, document_t *doc) {
|
||||
AVFrame *frame = av_frame_alloc();
|
||||
|
||||
AVPacket avPacket;
|
||||
av_init_packet(&avPacket);
|
||||
|
||||
int receive_ret = -EAGAIN;
|
||||
while (receive_ret == -EAGAIN) {
|
||||
// Get video frame
|
||||
while (1) {
|
||||
int read_frame_ret = av_read_frame(pFormatCtx, &avPacket);
|
||||
|
||||
if (read_frame_ret != 0) {
|
||||
if (read_frame_ret != AVERROR_EOF) {
|
||||
// LOG_WARNINGF(doc->filepath,
|
||||
// "(media.c) avcodec_read_frame() returned error code [%d] %s",
|
||||
// read_frame_ret, av_err2str(read_frame_ret)
|
||||
// )
|
||||
}
|
||||
av_frame_free(&frame);
|
||||
av_packet_unref(&avPacket);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//Ignore audio/other frames
|
||||
if (avPacket.stream_index != stream_idx) {
|
||||
av_packet_unref(&avPacket);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// Feed it to decoder
|
||||
int decode_ret = avcodec_send_packet(decoder, &avPacket);
|
||||
if (decode_ret != 0) {
|
||||
// LOG_ERRORF(doc->filepath,
|
||||
// "(media.c) avcodec_send_packet() returned error code [%d] %s",
|
||||
// decode_ret, av_err2str(decode_ret)
|
||||
// )
|
||||
av_frame_free(&frame);
|
||||
av_packet_unref(&avPacket);
|
||||
return NULL;
|
||||
}
|
||||
av_packet_unref(&avPacket);
|
||||
receive_ret = avcodec_receive_frame(decoder, frame);
|
||||
}
|
||||
return frame;
|
||||
}
|
||||
|
||||
/*
 * Appends the value of the dictionary entry `tag_` to `doc` as a string
 * metadata line stored under `keyname`.
 *
 * The expansion is a single brace-enclosed block, so its temporaries do not
 * leak into the caller's scope; it is written without a trailing semicolon.
 * The metadata line is skipped when its allocation fails.
 */
#define APPEND_TAG_META(doc, tag_, keyname) \
    { \
        text_buffer_t tex = text_buffer_create(-1); \
        text_buffer_append_string0(&tex, (tag_)->value); \
        text_buffer_terminate_string(&tex); \
        meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
        if (meta_tag != NULL) { \
            meta_tag->key = (keyname); \
            strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
            APPEND_META(doc, meta_tag) \
        } \
        text_buffer_destroy(&tex); \
    }
|
||||
|
||||
__always_inline
|
||||
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
|
||||
AVDictionaryEntry *tag = NULL;
|
||||
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
|
||||
char key[256];
|
||||
strncpy(key, tag->key, sizeof(key));
|
||||
|
||||
char *ptr = key;
|
||||
for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
|
||||
|
||||
if (strcmp(key, "artist") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaArtist)
|
||||
} else if (strcmp(key, "genre") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaGenre)
|
||||
} else if (strcmp(key, "title") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaTitle)
|
||||
} else if (strcmp(key, "album_artist") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaAlbumArtist)
|
||||
} else if (strcmp(key, "album") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaAlbum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void
|
||||
append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int include_audio_tags, int is_video) {
|
||||
|
||||
if (is_video) {
|
||||
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
|
||||
meta_duration->key = MetaMediaDuration;
|
||||
meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE;
|
||||
APPEND_META(doc, meta_duration)
|
||||
|
||||
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
|
||||
meta_bitrate->key = MetaMediaBitrate;
|
||||
meta_bitrate->long_val = pFormatCtx->bit_rate;
|
||||
APPEND_META(doc, meta_bitrate)
|
||||
}
|
||||
|
||||
AVDictionaryEntry *tag = NULL;
|
||||
if (is_video) {
|
||||
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
|
||||
if (include_audio_tags && strcmp(tag->key, "title") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaTitle)
|
||||
} else if (strcmp(tag->key, "comment") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaContent)
|
||||
} else if (include_audio_tags && strcmp(tag->key, "artist") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaArtist)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// EXIF metadata
|
||||
while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
|
||||
if (include_audio_tags && strcmp(tag->key, "Artist") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaArtist)
|
||||
} else if (strcmp(tag->key, "ImageDescription") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaContent)
|
||||
} else if (strcmp(tag->key, "Make") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifMake)
|
||||
} else if (strcmp(tag->key, "Model") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifModel)
|
||||
} else if (strcmp(tag->key, "Software") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifSoftware)
|
||||
} else if (strcmp(tag->key, "FNumber") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifFNumber)
|
||||
} else if (strcmp(tag->key, "FocalLength") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifFocalLength)
|
||||
} else if (strcmp(tag->key, "UserComment") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifUserComment)
|
||||
} else if (strcmp(tag->key, "ISOSpeedRatings") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifIsoSpeedRatings)
|
||||
} else if (strcmp(tag->key, "ExposureTime") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifExposureTime)
|
||||
} else if (strcmp(tag->key, "DateTime") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaExifDateTime)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Appends an int-valued metadata line to `doc`; skipped when allocation fails. */
static void append_int_meta(document_t *doc, enum metakey key, int value) {
    meta_line_t *meta = malloc(sizeof(meta_line_t));
    if (meta == NULL) {
        return;
    }
    meta->key = key;
    meta->int_val = value;
    APPEND_META(doc, meta)
}

/**
 * Extracts metadata from an opened format context and, when
 * `ctx->tn_size > 0` and a video stream exists, decodes one frame and
 * encodes it as a JPEG thumbnail.
 *
 * Streams are scanned from last to first; the first audio and the first
 * video stream found in that order supply the codec, width and height
 * metadata. Video streams of MIN_SIZE pixels or less on either side get no
 * thumbnail.
 *
 * `pFormatCtx` is always closed before returning; the caller must not use it
 * afterwards.
 */
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {

    int video_stream = -1;
    int audio_stream = -1;
    AVStream *stream = NULL;
    AVCodec *video_codec = NULL;
    AVCodecContext *decoder = NULL;
    AVCodecContext *jpeg_encoder = NULL;
    AVFrame *frame = NULL;
    AVFrame *scaled_frame = NULL;
    AVPacket jpeg_packet;

    avformat_find_stream_info(pFormatCtx, NULL);

    for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
        AVCodecParameters *par = pFormatCtx->streams[i]->codecpar;

        if (par->codec_type == AVMEDIA_TYPE_AUDIO) {
            if (audio_stream == -1) {
                append_int_meta(doc, MetaMediaAudioCodec, par->codec_id);
                append_audio_meta(pFormatCtx, doc);
                audio_stream = i;
            }
        } else if (par->codec_type == AVMEDIA_TYPE_VIDEO) {
            if (video_stream == -1) {
                append_int_meta(doc, MetaMediaVideoCodec, par->codec_id);
                append_int_meta(doc, MetaWidth, par->width);
                append_int_meta(doc, MetaHeight, par->height);
                video_stream = i;
            }
        }
    }

    if (video_stream == -1 || ctx->tn_size <= 0) {
        goto cleanup;
    }

    stream = pFormatCtx->streams[video_stream];
    if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
        goto cleanup;
    }

    // Decoder
    video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    if (video_codec == NULL) {
        goto cleanup;
    }
    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }
    if (avcodec_parameters_to_context(decoder, stream->codecpar) < 0
        || avcodec_open2(decoder, video_codec, NULL) < 0) {
        goto cleanup;
    }

    //Seek
    if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
        // The same seek (10% into the stream) is retried until it succeeds
        for (int i = 20; i >= 0; i--) {
            int seek_ret = av_seek_frame(pFormatCtx, video_stream,
                                         (int64_t) (stream->duration * 0.10), 0);
            if (seek_ret == 0) {
                break;
            }
        }
    }

    frame = read_frame(pFormatCtx, decoder, video_stream, doc);
    if (frame == NULL) {
        goto cleanup;
    }

    append_video_meta(pFormatCtx, frame, doc, audio_stream == -1, stream->nb_frames > 1);

    // Scale frame
    scaled_frame = scale_frame(decoder, frame, ctx->tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    // Encode frame to jpeg
    jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, ctx->tn_qscale);
    if (jpeg_encoder == NULL) {
        goto cleanup;
    }
    avcodec_send_frame(jpeg_encoder, scaled_frame);

    av_init_packet(&jpeg_packet);
    avcodec_receive_packet(jpeg_encoder, &jpeg_packet);

    // Save thumbnail
    // store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) jpeg_packet.data,
    //             jpeg_packet.size);

    av_packet_unref(&jpeg_packet);

cleanup:
    if (scaled_frame != NULL) {
        // The pixel buffer is owned separately from the frame (see scale_frame)
        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }
    // The libav free functions below accept pointers to NULL
    av_frame_free(&frame);
    avcodec_free_context(&jpeg_encoder);
    avcodec_free_context(&decoder);
    // avformat_close_input() frees the context and resets the pointer
    avformat_close_input(&pFormatCtx);
}
|
||||
|
||||
/**
 * Opens the media file at `filepath` and hands the resulting format context
 * to parse_media_format_ctx(). Nothing is appended to `doc` when the context
 * cannot be allocated or the file cannot be opened.
 */
void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_t *doc) {

    AVFormatContext *format_ctx = avformat_alloc_context();
    if (format_ctx == NULL) {
        // TODO: log allocation failure for doc->filepath
        return;
    }

    if (avformat_open_input(&format_ctx, filepath, NULL, NULL) >= 0) {
        parse_media_format_ctx(ctx, format_ctx, doc);
        return;
    }

    // TODO: log the avformat_open_input() error for doc->filepath
    avformat_close_input(&format_ctx);
    avformat_free_context(format_ctx);
}
|
||||
|
||||
|
||||
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
|
||||
struct vfile *f = ptr;
|
||||
|
||||
int ret = f->read(f, buf, buf_size);
|
||||
|
||||
if (ret == 0) {
|
||||
return AVERROR_EOF;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * Parses media that is only reachable through the vfile read callback
 * (no seeking), using a custom AVIO context.
 *
 * The AVIO buffer and context are owned by this function and released on
 * every path; the format context itself is closed by
 * parse_media_format_ctx() on success.
 */
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) {

    AVFormatContext *pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        // TODO: log allocation failure for doc->filepath
        return;
    }

    unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer == NULL) {
        avformat_free_context(pFormatCtx);
        return;
    }

    AVIOContext *io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
    if (io_ctx == NULL) {
        av_free(buffer);
        avformat_free_context(pFormatCtx);
        return;
    }

    pFormatCtx->pb = io_ctx;
    pFormatCtx->flags |= AVFMT_FLAG_CUSTOM_IO;

    int res = avformat_open_input(&pFormatCtx, "", NULL, NULL);
    if (res < 0) {
        // res == -5: tried to parse media that requires seek.
        // TODO: log any other error for doc->filepath
        avformat_close_input(&pFormatCtx);
        avformat_free_context(pFormatCtx);
    } else {
        parse_media_format_ctx(ctx, pFormatCtx, doc);
    }

    // Free through io_ctx->buffer rather than the local pointer
    av_free(io_ctx->buffer);
    avio_context_free(&io_ctx);
}
|
||||
|
||||
/**
 * Entry point of the media parser: files that live on the file system are
 * opened by path, everything else is read through the vfile callbacks.
 */
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc) {
    if (!f->is_fs_file) {
        parse_media_vfile(ctx, f, doc);
        return;
    }

    parse_media_filename(ctx, f->filepath, doc);
}
|
18
third-party/libscan/libscan/media/media.h
vendored
18
third-party/libscan/libscan/media/media.h
vendored
@ -1,18 +0,0 @@
|
||||
#ifndef SIST2_MEDIA_H
|
||||
#define SIST2_MEDIA_H
|
||||
|
||||
|
||||
#include "../scan.h"
|
||||
|
||||
/* Size thresholds; parenthesized so they expand safely inside larger
 * expressions such as `x / MIN_VIDEO_SIZE`. */
#define MIN_VIDEO_SIZE (1024 * 64)
#define MIN_IMAGE_SIZE (1024 * 2)
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
int tn_size;
|
||||
float tn_qscale;
|
||||
} scan_media_ctx_t;
|
||||
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
#endif
|
142
third-party/libscan/libscan/ooxml/ooxml.c
vendored
142
third-party/libscan/libscan/ooxml/ooxml.c
vendored
@ -1,142 +0,0 @@
|
||||
#include "ooxml.h"
|
||||
|
||||
#include "../util.h"
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#include <libxml/xmlstring.h>
|
||||
#include <libxml/parser.h>
|
||||
|
||||
__always_inline
|
||||
static int should_read_part(const char *part) {
|
||||
|
||||
// LOG_DEBUGF("ooxml.c", "Got part : %s", part)
|
||||
|
||||
if (part == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if ( // Word
|
||||
STR_STARTS_WITH(part, "word/document.xml")
|
||||
|| STR_STARTS_WITH(part, "word/footnotes.xml")
|
||||
|| STR_STARTS_WITH(part, "word/endnotes.xml")
|
||||
|| STR_STARTS_WITH(part, "word/footer")
|
||||
|| STR_STARTS_WITH(part, "word/header")
|
||||
// PowerPoint
|
||||
|| STR_STARTS_WITH(part, "ppt/slides/slide")
|
||||
|| STR_STARTS_WITH(part, "ppt/notesSlides/slide")
|
||||
// Excel
|
||||
|| STR_STARTS_WITH(part, "xl/worksheets/sheet")
|
||||
|| STR_STARTS_WITH(part, "xl/sharedStrings.xml")
|
||||
|| STR_STARTS_WITH(part, "xl/workbook.xml")
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/**
 * Walks `node`, its following siblings and all their descendants, appending
 * the text of every element named "t" to `buf`, each followed by a space.
 *
 * Returns -1 without visiting anything when libxml2's last recorded error is
 * fatal, 0 otherwise. The result of the recursive calls is not inspected.
 */
int extract_text(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
    xmlErrorPtr last_error = xmlGetLastError();
    if (last_error != NULL && last_error->level == XML_ERR_FATAL) {
        // TODO: log last_error->message (recoverable errors are ignored)
        return -1;
    }

    xmlNode *current = node;
    while (current != NULL) {
        const xmlChar *name = current->name;

        if (name[0] == 't' && name[1] == '\0') {
            xmlChar *text = xmlNodeListGetString(xml, current->xmlChildrenNode, 1);
            if (text != NULL) {
                text_buffer_append_string0(buf, (char *) text);
                text_buffer_append_char(buf, ' ');
                xmlFree(text);
            }
        }

        extract_text(xml, current->children, buf);
        current = current->next;
    }
    return 0;
}
|
||||
|
||||
/**
 * libxml2 read callback: pulls up to `len` bytes of the current archive
 * entry into `buffer`. `context` is the `struct archive *` being read.
 */
int xml_io_read(void *context, char *buffer, int len) {
    return archive_read_data((struct archive *) context, buffer, len);
}
|
||||
|
||||
/**
 * libxml2 close callback. Intentionally does nothing: the archive handle is
 * closed by the code that opened it. Always returns 0.
 */
int xml_io_close(UNUSED(void *context)) {
    return 0;
}
|
||||
|
||||
__always_inline
|
||||
static int read_part(struct archive *a, text_buffer_t *buf, document_t *doc) {
|
||||
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
|
||||
if (xml == NULL) {
|
||||
// LOG_ERROR(doc->filepath, "Could not parse XML")
|
||||
return -1;
|
||||
}
|
||||
|
||||
xmlNode *root = xmlDocGetRootElement(xml);
|
||||
if (root == NULL) {
|
||||
// LOG_ERROR(doc->filepath, "Empty document")
|
||||
xmlFreeDoc(xml);
|
||||
return -1;
|
||||
}
|
||||
|
||||
extract_text(xml, root, buf);
|
||||
xmlFreeDoc(xml);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Extracts the text of an OOXML document (a zip archive of XML parts) and
 * appends it to `doc` as a MetaContent line.
 *
 * The whole file is loaded with read_all() and read as an in-memory zip.
 * Extraction stops at the first part that fails to parse; text gathered up
 * to that point is still stored. Nothing is appended when no text was found
 * or the archive cannot be opened.
 */
void parse_doc(scan_ooxml_cxt_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        return;
    }

    struct archive *a = archive_read_new();
    archive_read_support_format_zip(a);

    int ret = archive_read_open_memory(a, buf, buf_len);
    if (ret != ARCHIVE_OK) {
        // TODO: log archive_error_string(a) for doc->filepath
        archive_read_free(a);
        // NOTE(review): assumes read_all() returns a malloc'ed buffer owned
        // by the caller - confirm against its definition
        free(buf);
        return;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    struct archive_entry *entry;
    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
            const char *path = archive_entry_pathname(entry);

            if (should_read_part(path)) {
                ret = read_part(a, &tex, doc);
                if (ret != 0) {
                    break;
                }
            }
        }
    }

    if (tex.dyn_buffer.cur > 0) {
        text_buffer_terminate_string(&tex);

        meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        if (meta != NULL) {
            meta->key = MetaContent;
            strcpy(meta->str_val, tex.dyn_buffer.buf);
            APPEND_META(doc, meta)
        }
    }

    archive_read_close(a);
    archive_read_free(a);
    text_buffer_destroy(&tex);
    // The archive reads straight from buf, so release it only after the
    // archive is gone (same ownership assumption as above)
    free(buf);
}
|
13
third-party/libscan/libscan/ooxml/ooxml.h
vendored
13
third-party/libscan/libscan/ooxml/ooxml.h
vendored
@ -1,13 +0,0 @@
|
||||
#ifndef SCAN_OOXML_H
|
||||
#define SCAN_OOXML_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "../scan.h"
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
} scan_ooxml_cxt_t;
|
||||
|
||||
void parse_doc(scan_ooxml_cxt_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
#endif
|
131
third-party/libscan/libscan/scan.h
vendored
131
third-party/libscan/libscan/scan.h
vendored
@ -1,131 +0,0 @@
|
||||
#ifndef SCAN_SCAN_H
|
||||
#define SCAN_SCAN_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <uuid/uuid.h>
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
|
||||
/* Bits of a metadata key that encode the type of the stored value. */
#define META_INT_MASK 0x80
#define META_STR_MASK 0x40
#define META_LONG_MASK 0x20

/* Marks a parameter as intentionally unused: `int f(UNUSED(void *p))`. */
#define UNUSED(x) __attribute__((__unused__)) x

/* Build a metadata key from an id and its value type. */
#define META_STR(id) (((unsigned) (id)) | ((unsigned) META_STR_MASK))
#define META_INT(id) (((unsigned) (id)) | ((unsigned) META_INT_MASK))
#define META_LONG(id) (((unsigned) (id)) | ((unsigned) META_LONG_MASK))

/*
 * Type tests. IS_META_INT and IS_META_LONG take a key, IS_META_STR takes a
 * pointer to a metadata line. The expansions are fully parenthesized so they
 * can be negated or combined with other operators.
 */
#define IS_META_INT(key) (((key) & META_INT_MASK) == META_INT_MASK)
#define IS_META_LONG(key) (((key) & META_LONG_MASK) == META_LONG_MASK)
#define IS_META_STR(meta) (((meta)->key & META_STR_MASK) == META_STR_MASK)
|
||||
|
||||
|
||||
/* Result code returned by parsers. */
typedef int scan_code_t;
/* Parenthesized so the casts bind as a unit wherever the codes are used. */
#define SCAN_OK ((scan_code_t) 0)
#define SCAN_ERR_READ ((scan_code_t) -1)
|
||||
|
||||
// This is written to file as a 16-bit int!
|
||||
enum metakey {
|
||||
MetaContent = META_STR(1),
|
||||
MetaWidth = META_INT(2),
|
||||
MetaHeight = META_INT(3),
|
||||
MetaMediaDuration = META_LONG(4),
|
||||
MetaMediaAudioCodec = META_INT(5),
|
||||
MetaMediaVideoCodec = META_INT(6),
|
||||
MetaMediaBitrate = META_LONG(7),
|
||||
MetaArtist = META_STR(8),
|
||||
MetaAlbum = META_STR(9),
|
||||
MetaAlbumArtist = META_STR(10),
|
||||
MetaGenre = META_STR(11),
|
||||
MetaTitle = META_STR(12),
|
||||
MetaFontName = META_STR(13),
|
||||
MetaParent = META_STR(14),
|
||||
MetaExifMake = META_STR(15),
|
||||
MetaExifSoftware = META_STR(16),
|
||||
MetaExifExposureTime = META_STR(17),
|
||||
MetaExifFNumber = META_STR(18),
|
||||
MetaExifFocalLength = META_STR(19),
|
||||
MetaExifUserComment = META_STR(20),
|
||||
MetaExifModel = META_STR(21),
|
||||
MetaExifIsoSpeedRatings = META_STR(22),
|
||||
MetaExifDateTime = META_STR(23),
|
||||
};
|
||||
|
||||
typedef struct meta_line {
|
||||
struct meta_line *next;
|
||||
enum metakey key;
|
||||
union {
|
||||
char str_val[0];
|
||||
int int_val;
|
||||
unsigned long long_val;
|
||||
};
|
||||
} meta_line_t;
|
||||
|
||||
|
||||
typedef struct document {
|
||||
unsigned char uuid[16];
|
||||
unsigned long ino;
|
||||
unsigned long size;
|
||||
unsigned int mime;
|
||||
int mtime;
|
||||
short base;
|
||||
short ext;
|
||||
meta_line_t *meta_head;
|
||||
meta_line_t *meta_tail;
|
||||
char *filepath;
|
||||
} document_t;
|
||||
|
||||
typedef struct vfile vfile_t;

/* Reads up to `size` bytes of the file into `buf`. */
__attribute__((warn_unused_result))
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);

/* Releases whatever backs the file. */
typedef void (*close_func_t)(struct vfile *);

/*
 * A file that is read through callbacks, so that parsers can treat regular
 * files and archive members alike.
 */
typedef struct vfile {
    /* Backing handle: a file descriptor or a libarchive reader. */
    union {
        int fd;
        struct archive *arc;
    };

    /* Non-zero when the file can be opened directly through `filepath`. */
    int is_fs_file;
    char *filepath;
    struct stat info;

    read_func_t read;
    close_func_t close;
} vfile_t;
|
||||
|
||||
typedef struct parse_job_t {
|
||||
int base;
|
||||
int ext;
|
||||
struct stat info;
|
||||
struct vfile vfile;
|
||||
uuid_t parent;
|
||||
char filepath[1];
|
||||
} parse_job_t;
|
||||
|
||||
|
||||
/*
 * Appends the metadata line `meta` at the tail of the list kept in `doc`
 * (meta_head / meta_tail) and terminates the list after it.
 *
 * The expansion is a single brace-enclosed block, so it behaves as one
 * statement even as the body of an unbraced `if`. It is written without a
 * trailing semicolon. Arguments are evaluated more than once.
 */
#define APPEND_META(doc, meta) \
    { \
        (meta)->next = NULL; \
        if ((doc)->meta_head == NULL) { \
            (doc)->meta_head = (meta); \
            (doc)->meta_tail = (doc)->meta_head; \
        } else { \
            (doc)->meta_tail->next = (meta); \
            (doc)->meta_tail = (meta); \
        } \
    }
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#include "arc/arc.h"
|
||||
#include "cbr/cbr.h"
|
||||
#include "ebook/ebook.h"
|
||||
#include "font/font.h"
|
||||
#include "media/media.h"
|
||||
#include "ooxml/ooxml.h"
|
||||
#include "text/text.h"
|
31
third-party/libscan/libscan/text/text.c
vendored
31
third-party/libscan/libscan/text/text.c
vendored
@ -1,31 +0,0 @@
|
||||
#include "text.h"
|
||||
|
||||
/**
 * Reads at most `ctx->content_size` bytes of a text file and stores them in
 * `doc` as a MetaContent line.
 *
 * Only the bytes actually returned by the read callback are kept.
 *
 * Returns SCAN_OK on success and SCAN_ERR_READ when the read fails. The same
 * error code is returned when memory cannot be allocated, because no
 * dedicated code exists.
 */
scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc) {

    int to_read = MIN(ctx->content_size, doc->size);

    char *buf = malloc(to_read);
    if (buf == NULL && to_read > 0) {
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        //TODO: log
        free(buf);
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    // A short read leaves the rest of buf uninitialised: append `ret` bytes only
    text_buffer_append_string(&tex, buf, ret);
    text_buffer_terminate_string(&tex);
    free(buf);

    meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
    if (meta == NULL) {
        text_buffer_destroy(&tex);
        return SCAN_ERR_READ;
    }
    meta->key = MetaContent;
    strcpy(meta->str_val, tex.dyn_buffer.buf);

    APPEND_META(doc, meta)

    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
||||
|
13
third-party/libscan/libscan/text/text.h
vendored
13
third-party/libscan/libscan/text/text.h
vendored
@ -1,13 +0,0 @@
|
||||
#ifndef SCAN_TEXT_H
|
||||
#define SCAN_TEXT_H
|
||||
|
||||
#include "../scan.h"
|
||||
#include "../util.h"
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
} scan_text_ctx_t;
|
||||
|
||||
scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc);
|
||||
|
||||
#endif
|
0
third-party/libscan/libscan/util.c
vendored
0
third-party/libscan/libscan/util.c
vendored
276
third-party/libscan/libscan/util.h
vendored
276
third-party/libscan/libscan/util.h
vendored
@ -1,276 +0,0 @@
|
||||
#ifndef SCAN_UTIL_H
|
||||
#define SCAN_UTIL_H
|
||||
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "string.h"
|
||||
#include "../third-party/utf8.h/utf8.h"
|
||||
#include "macros.h"
|
||||
|
||||
/* True when string `x` starts with `y`; `y` must be a string literal (or a
 * char array) because its length is taken with sizeof. */
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)

/* Returned by text_buffer_append_char() when the size limit is reached. */
#define TEXT_BUF_FULL (-1)
/* Initial capacity, in bytes, of a dyn_buffer_t. */
#define INITIAL_BUF_SIZE (1024 * 16)

/* Character filter for text buffers: keeps the range ' to ;, the range
 * A to z, and everything above 127. `c` is evaluated more than once. */
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127))
|
||||
|
||||
|
||||
/* Growable byte buffer. */
typedef struct dyn_buffer {
    char *buf;      /* storage */
    size_t cur;     /* offset of the next byte to write */
    size_t size;    /* capacity of `buf` in bytes */
} dyn_buffer_t;

/* Text accumulator built on a dyn_buffer_t. */
typedef struct text_buffer {
    /* Size limit in bytes; only enforced when greater than zero. */
    long max_size;
    /* Set after a space was written, so that runs collapse to one space. */
    int last_char_was_whitespace;
    dyn_buffer_t dyn_buffer;
} text_buffer_t;
|
||||
|
||||
static int utf8_validchr2(const char *s) {
|
||||
if (0x00 == (0x80 & *s)) {
|
||||
return TRUE;
|
||||
} else if (0xf0 == (0xf8 & *s)) {
|
||||
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
|
||||
(0x80 != (0xc0 & s[3]))) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0x80 == (0xc0 & s[4])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
|
||||
return FALSE;
|
||||
}
|
||||
} else if (0xe0 == (0xf0 & *s)) {
|
||||
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0x80 == (0xc0 & s[3])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
|
||||
return FALSE;
|
||||
}
|
||||
} else if (0xc0 == (0xe0 & *s)) {
|
||||
if (0x80 != (0xc0 & s[1])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0x80 == (0xc0 & s[2])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0 == (0x1e & s[0])) {
|
||||
return FALSE;
|
||||
}
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
static dyn_buffer_t dyn_buffer_create() {
|
||||
dyn_buffer_t buf;
|
||||
|
||||
buf.size = INITIAL_BUF_SIZE;
|
||||
buf.cur = 0;
|
||||
buf.buf = malloc(INITIAL_BUF_SIZE);
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
 * Makes sure `size` more bytes fit after the write position, doubling the
 * capacity as many times as needed. Does nothing when they already fit.
 */
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
    if (buf->cur + size <= buf->size) {
        return;
    }

    do {
        buf->size *= 2;
    } while (buf->cur + size > buf->size);

    buf->buf = realloc(buf->buf, buf->size);
}
|
||||
|
||||
/**
 * Makes sure at least sizeof(long) bytes fit after the write position,
 * doubling the capacity once when they do not.
 */
static void grow_buffer_small(dyn_buffer_t *buf) {
    if (buf->cur + sizeof(long) <= buf->size) {
        return;
    }

    buf->size *= 2;
    buf->buf = realloc(buf->buf, buf->size);
}
|
||||
|
||||
/** Appends `size` bytes from `data` to the buffer, growing it if needed. */
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
    grow_buffer(buf, size);

    char *write_pos = buf->buf + buf->cur;
    memcpy(write_pos, data, size);
    buf->cur += size;
}
|
||||
|
||||
/** Appends the single byte `c` to the buffer, growing it if needed. */
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
    grow_buffer_small(buf);

    buf->buf[buf->cur] = c;
    buf->cur += 1;
}
|
||||
|
||||
/**
 * Appends the string `str` including its terminating NUL byte.
 * The string is only read, so it is taken as `const char *`.
 */
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
    dyn_buffer_write(buf, str, strlen(str));
    dyn_buffer_write_char(buf, '\0');
}
|
||||
|
||||
/**
 * Appends the characters of `str` without its terminating NUL byte.
 * The string is only read, so it is taken as `const char *`.
 */
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
    dyn_buffer_write(buf, str, strlen(str));
}
|
||||
|
||||
/**
 * Appends the bytes of `d` in host byte order.
 * memcpy is used because the write position is not guaranteed to be aligned
 * for an int.
 */
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
    grow_buffer_small(buf);

    memcpy(buf->buf + buf->cur, &d, sizeof(d));
    buf->cur += sizeof(int);
}
|
||||
|
||||
/**
 * Appends the bytes of `s` in host byte order.
 * memcpy is used because the write position is not guaranteed to be aligned
 * for a short.
 */
static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) {
    grow_buffer_small(buf);

    memcpy(buf->buf + buf->cur, &s, sizeof(s));
    buf->cur += sizeof(short);
}
|
||||
|
||||
/**
 * Appends the bytes of `l` in host byte order.
 * memcpy is used because the write position is not guaranteed to be aligned
 * for an unsigned long.
 */
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
    grow_buffer_small(buf);

    memcpy(buf->buf + buf->cur, &l, sizeof(l));
    buf->cur += sizeof(unsigned long);
}
|
||||
|
||||
/**
 * Frees the storage of the buffer. The pointer is cleared so that a second
 * call, or a stray access, does not touch freed memory. `cur` and `size`
 * are left untouched.
 */
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
    free(buf->buf);
    buf->buf = NULL;
}
|
||||
|
||||
/** Frees the storage held by the text buffer's underlying dyn_buffer_t. */
static void text_buffer_destroy(text_buffer_t *buf) {
    dyn_buffer_t *storage = &buf->dyn_buffer;
    dyn_buffer_destroy(storage);
}
|
||||
|
||||
static text_buffer_t text_buffer_create(long max_size) {
|
||||
text_buffer_t text_buf;
|
||||
|
||||
text_buf.dyn_buffer = dyn_buffer_create();
|
||||
text_buf.max_size = max_size;
|
||||
text_buf.last_char_was_whitespace = FALSE;
|
||||
|
||||
return text_buf;
|
||||
}
|
||||
|
||||
/**
 * Append one Unicode code point to the text buffer, encoded as UTF-8.
 *
 * Characters matched by SHOULD_IGNORE_CHAR() and plain spaces are all treated
 * as whitespace: a single ' ' is emitted for a run of them, and nothing is
 * emitted while the buffer is still empty (no leading whitespace).
 *
 * Any other code point is encoded into 1 to 4 bytes depending on its
 * magnitude. grow_buffer_small() is called once beforehand and the bytes are
 * then stored directly.
 *
 * @param buf destination text buffer
 * @param c   code point to append
 * @return TEXT_BUF_FULL when a byte was written and the cursor has reached
 *         `max_size` (only checked when `max_size > 0`), 0 otherwise
 */
static int text_buffer_append_char(text_buffer_t *buf, int c) {
    dyn_buffer_t *out = &buf->dyn_buffer;

    if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
        /* Collapse whitespace runs and skip leading whitespace entirely. */
        if (buf->last_char_was_whitespace || out->cur == 0) {
            return 0;
        }

        dyn_buffer_write_char(out, ' ');
        buf->last_char_was_whitespace = TRUE;
    } else {
        buf->last_char_was_whitespace = FALSE;
        grow_buffer_small(out);

        if (((utf8_int32_t) 0xffffff80 & c) == 0) {
            /* 1 byte: 0xxxxxxx */
            out->buf[out->cur++] = (char) c;
        } else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            out->buf[out->cur++] = 0xc0 | (char) (c >> 6);
            out->buf[out->cur++] = 0x80 | (char) (c & 0x3f);
        } else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            out->buf[out->cur++] = 0xe0 | (char) (c >> 12);
            out->buf[out->cur++] = 0x80 | (char) ((c >> 6) & 0x3f);
            out->buf[out->cur++] = 0x80 | (char) (c & 0x3f);
        } else {
            /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            out->buf[out->cur++] = 0xf0 | (char) (c >> 18);
            out->buf[out->cur++] = 0x80 | (char) ((c >> 12) & 0x3f);
            out->buf[out->cur++] = 0x80 | (char) ((c >> 6) & 0x3f);
            out->buf[out->cur++] = 0x80 | (char) (c & 0x3f);
        }
    }

    /* Reached only after something was actually written. */
    if (buf->max_size > 0 && out->cur >= buf->max_size) {
        return TEXT_BUF_FULL;
    }

    return 0;
}
|
||||
|
||||
|
||||
/**
 * NUL-terminate the text accumulated in the buffer.
 *
 * When the last byte written is a space, that space is overwritten with the
 * terminator (trimming trailing whitespace) and the cursor is left unchanged.
 * Otherwise a '\0' byte is appended after the existing content.
 *
 * @param buf text buffer to terminate
 */
static void text_buffer_terminate_string(text_buffer_t *buf) {
    dyn_buffer_t *out = &buf->dyn_buffer;
    const int ends_with_space = out->cur > 0 && out->buf[out->cur - 1] == ' ';

    if (ends_with_space) {
        out->buf[out->cur - 1] = '\0';
    } else {
        dyn_buffer_write_char(out, '\0');
    }
}
|
||||
|
||||
/*
 * True when decoding must stop at `ptr`. Expects `str` (start of the input),
 * `ptr` (current position) and `len` (input length in bytes) to be in scope.
 *
 * Stops when `len` bytes have been consumed, when a NUL byte is reached, or
 * when the lead byte at `ptr` announces a 2-, 3- or 4-byte sequence that would
 * extend past `len`. The length test comes first so `*ptr` is never read once
 * `ptr` has reached `str + len`.
 *
 * NOTE(review): `len - N` is unsigned arithmetic and wraps when `len < N`;
 * text_buffer_append_string() handles inputs of 4 bytes or fewer separately.
 */
#define UTF8_END_OF_STRING \
    (len <= ptr - str || 0 == *ptr || \
    ((0xe0 & *ptr) == 0xc0 && len - 2 < ptr - str) || \
    ((0xf0 & *ptr) == 0xe0 && len - 3 < ptr - str) || \
    ((0xf8 & *ptr) == 0xf0 && len - 4 < ptr - str))
|
||||
|
||||
/**
 * Append up to `len` bytes of UTF-8 text to the text buffer.
 *
 * Inputs of 4 bytes or fewer take a short path: only 7-bit ASCII bytes are
 * kept and they are written straight to the underlying dynamic buffer, i.e.
 * without the whitespace collapsing or size-limit check performed by
 * text_buffer_append_char().
 *
 * Longer inputs are decoded one code point at a time with utf8codepoint().
 * The raw bytes of each sequence are copied into a zeroed scratch area and
 * checked with utf8_validchr2(); sequences that fail the check are skipped,
 * all others are appended through text_buffer_append_char().
 *
 * @param buf destination text buffer
 * @param str input bytes; may be NULL, in which case nothing is appended
 * @param len number of bytes available at `str`
 * @return 0 on success, or the first non-zero value returned by
 *         text_buffer_append_char() (e.g. TEXT_BUF_FULL)
 */
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
    const char *ptr = str;
    const char *oldPtr = ptr;

    /* The macro dereferences ptr, so the NULL test must come first. */
    if (str == NULL || UTF8_END_OF_STRING) {
        return 0;
    }

    if (len <= 4) {
        /* size_t index: avoids a signed/unsigned comparison against len. */
        for (size_t i = 0; i < len; i++) {
            if (((utf8_int32_t) 0xffffff80 & str[i]) == 0) {
                dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
            }
        }
        return 0;
    }

    utf8_int32_t c;
    char tmp[16];

    do {
        ptr = utf8codepoint(ptr, &c);

        /*
         * Clear the scratch bytes a previous, longer sequence may have left
         * behind. memset() replaces a store through a casted `int *`, which
         * was not guaranteed to be aligned for a char array.
         */
        memset(tmp, 0, sizeof(int));
        memcpy(tmp, oldPtr, ptr - oldPtr);
        oldPtr = ptr;

        if (!utf8_validchr2(tmp)) {
            /* `continue` in a do/while still evaluates the loop condition. */
            continue;
        }

        int ret = text_buffer_append_char(buf, c);
        if (ret != 0) {
            return ret;
        }
    } while (!UTF8_END_OF_STRING);

    return 0;
}
|
||||
|
||||
/**
 * Append a NUL-terminated string to the text buffer.
 *
 * Convenience wrapper around text_buffer_append_string() that measures the
 * string with strlen(). A NULL `str` is rejected here because strlen(NULL)
 * is undefined behaviour and would run before the callee's own NULL check;
 * the result (0, nothing appended) matches what the callee returns for NULL.
 *
 * @param buf destination text buffer
 * @param str NUL-terminated UTF-8 string, or NULL
 * @return 0 on success or when `str` is NULL, otherwise the non-zero value
 *         returned by text_buffer_append_string()
 */
static int text_buffer_append_string0(text_buffer_t *buf, char *str) {
    if (str == NULL) {
        return 0;
    }

    return text_buffer_append_string(buf, str, strlen(str));
}
|
||||
|
||||
/**
 * Read the whole content of a virtual file into a newly allocated buffer.
 *
 * The buffer is sized from `f->info.st_size` and filled with one call to
 * `f->read`.
 *
 * @param f    file to read
 * @param size out: set to the value returned by `f->read`, or to 0 when the
 *             allocation fails
 * @return heap buffer owned by the caller (release with free()), or NULL when
 *         the allocation fails -- `f->read` is then not called at all, where
 *         previously it would have been handed a NULL destination
 */
static void* read_all(vfile_t *f, size_t *size) {
    void *buf = malloc(f->info.st_size);

    if (buf == NULL) {
        *size = 0;
        return NULL;
    }

    *size = f->read(f, buf, f->info.st_size);

    //TODO: log

    return buf;
}
|
||||
|
||||
#endif
|
1
third-party/onion
vendored
1
third-party/onion
vendored
@ -1 +0,0 @@
|
||||
Subproject commit 2b3b230b79ecae119b7eb847f2f9545a46bef13c
|
1
third-party/utf8.h
vendored
1
third-party/utf8.h
vendored
@ -1 +0,0 @@
|
||||
Subproject commit b686b0c5181c2dd9f8297e6ac3692c9614b083be
|
Loading…
x
Reference in New Issue
Block a user