Initial commit (squashed)

This commit is contained in:
2019-09-27 22:56:07 -04:00
commit 564a17a8fa
75 changed files with 7518 additions and 0 deletions

211
src/parsing/font.c Normal file
View File

@@ -0,0 +1,211 @@
#include "font.h"
#include "ft2build.h"
#include "freetype/freetype.h"
#include "src/ctx.h"
// Per-thread FreeType library handle. It starts out NULL and is initialized
// by parse_font() the first time that function runs on a thread.
__thread FT_Library library = NULL;
// Size of the 8-bit bitmap needed to render a string, as computed by text_dimension().
typedef struct text_dimensions {
// Sum over all characters of MAX(advance, bitmap width) plus kerning
unsigned int width;
// Largest glyph ascent + largest glyph descent
unsigned int height;
// Largest glyph descent; parse_font() subtracts it from the height to place glyphs
unsigned int baseline;
} text_dimensions_t;
// Metrics and pixel data of one glyph, filled from a FreeType glyph slot by ft_glyph_to_glyph().
typedef struct glyph {
// Copy of slot->bitmap_top
// NOTE(review): bitmap_top is a signed value in FreeType; storing it unsigned wraps when it is negative
unsigned int top;
// Copy of slot->bitmap.rows
unsigned int height;
// Copy of slot->bitmap.width
unsigned int width;
// Derived from height and top in ft_glyph_to_glyph()
unsigned int descent;
// Derived from top, height and descent in ft_glyph_to_glyph()
unsigned int ascent;
// slot->advance.x divided by 64
unsigned int advance_width;
// Points at slot->bitmap.buffer; not copied, so it is only valid while the slot holds this glyph
unsigned char *pixmap;
} glyph_t;
/**
 * Horizontal kerning adjustment to apply between the previous character and
 * the current one.
 *
 * @param c    current (right-hand) character
 * @param pc   previous (left-hand) character; 0 means "no previous character"
 * @param face face both characters are rendered with
 * @return kerning along x (kerning.x / 64), or 0 when there is no previous
 *         character or FreeType reports an error
 */
__always_inline
int kerning_offset(char c, char pc, FT_Face face) {
    if (pc == 0) {
        // First character of the string: nothing to kern against
        return 0;
    }

    // FT_Get_Kerning() works on glyph indices, not character codes, and takes
    // the left glyph first. Going through unsigned char keeps bytes >= 0x80
    // from being sign-extended into huge character codes.
    FT_UInt left_glyph = FT_Get_Char_Index(face, (unsigned char) pc);
    FT_UInt right_glyph = FT_Get_Char_Index(face, (unsigned char) c);

    FT_Vector kerning = {0, 0};
    if (FT_Get_Kerning(face, left_glyph, right_glyph, FT_KERNING_DEFAULT, &kerning) != 0) {
        return 0;
    }

    return (int) (kerning.x / 64);
}
/**
 * Convert the glyph currently held by a FreeType glyph slot into a glyph_t.
 *
 * The pixmap pointer is borrowed from the slot (not copied), so the result is
 * only usable until the next glyph is loaded into the same slot.
 *
 * @param slot glyph slot of a face on which a character was just loaded
 * @return the glyph's bitmap size, advance, ascent/descent and pixmap pointer
 */
__always_inline
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
    glyph_t glyph;

    // Do the ascent/descent arithmetic on signed values: with unsigned
    // operands MAX(0, height - top) can never clamp and wraps around as soon
    // as the glyph sits entirely above the baseline (top > height).
    const int top = (int) slot->bitmap_top;
    const int height = (int) slot->bitmap.rows;
    const int descent = MAX(0, height - top);
    const int ascent = MAX(0, MAX(top, height) - descent);

    glyph.pixmap = slot->bitmap.buffer;

    glyph.width = slot->bitmap.width;
    glyph.height = slot->bitmap.rows;
    glyph.top = (unsigned int) top;
    glyph.advance_width = slot->advance.x / 64;

    glyph.descent = (unsigned int) descent;
    glyph.ascent = (unsigned int) ascent;

    return glyph;
}
// NOTE(review): the body is empty, so this non-void function falls off the end
// without returning a glyph_t. Nothing in the visible sources calls it; confirm
// whether it should be implemented or removed.
__always_inline
glyph_t get_glyph(char character, FT_Face face) {
}
/**
 * Measure the bitmap needed to draw `text` with `face` at its current size.
 *
 * Width is the sum, over every character that could be loaded, of
 * MAX(advance, bitmap width) plus the kerning against the previous character.
 * Height is the largest ascent plus the largest descent, and the baseline is
 * the largest descent.
 *
 * @param text NUL-terminated string to measure
 * @param face face to measure with
 * @return dimensions of the text bitmap
 */
text_dimensions_t text_dimension(char *text, FT_Face face) {
    text_dimensions_t dimensions;
    dimensions.width = 0;

    unsigned int max_ascent = 0;
    unsigned int max_descent = 0;

    char pc = 0;
    for (const char *cursor = text; *cursor != '\0'; cursor++) {
        char c = *cursor;

        // parse_font() skips characters FreeType cannot load; skip them here
        // as well, otherwise the stale content of the glyph slot (the
        // previous character) would be measured a second time.
        if (FT_Load_Char(face, c, 0) != 0) {
            continue;
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        max_descent = MAX(max_descent, glyph.descent);
        max_ascent = MAX(max_ascent, glyph.ascent);

        int kerning_x = kerning_offset(c, pc, face);
        dimensions.width += MAX(glyph.advance_width, glyph.width) + kerning_x;

        pc = c;
    }

    dimensions.height = max_ascent + max_descent;
    dimensions.baseline = max_descent;

    return dimensions;
}
/**
 * OR the pixels of a glyph into the text bitmap, with the glyph's top-left
 * corner placed at (x, y).
 *
 * The bitmap is addressed as text_info.width bytes per row; any target index
 * at or past width * height is silently skipped.
 *
 * @param glyph     glyph to draw (its pixmap is read row by row)
 * @param x         column of the glyph's left edge in the bitmap
 * @param y         row of the glyph's top edge in the bitmap
 * @param text_info dimensions of the destination bitmap
 * @param bitmap    destination buffer
 */
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
    const unsigned int stride = text_info.width;
    const unsigned int limit = text_info.width * text_info.height;
    const unsigned char *pixel = glyph->pixmap;

    unsigned int row_start = y * text_info.width + x;

    for (unsigned int row = 0; row < glyph->height; row++, row_start += stride) {
        for (unsigned int col = 0; col < glyph->width; col++, pixel++) {
            unsigned int target = row_start + col;

            if (target < limit) {
                bitmap[target] |= *pixel;
            }
        }
    }
}
/**
 * Serialize an 8-bit bitmap as a palettized BMP file into `buf`.
 *
 * The palette maps index 0 to white and index 255 to black. The size field
 * is patched in at byte offset 2 of the buffer once everything is written,
 * so the BMP is expected to start at the beginning of `buf`.
 *
 * @param buf        destination buffer
 * @param dimensions width/height of `bitmap`
 * @param bitmap     width * height bytes, stored top row first
 */
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {

    dyn_buffer_write_short(buf, 0x4D42); // Magic
    dyn_buffer_write_int(buf, 0); // Size placeholder
    dyn_buffer_write_int(buf, 0x5157); //Reserved
    dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset

    dyn_buffer_write_int(buf, 40); // DIB size
    dyn_buffer_write_int(buf, (int) dimensions.width);
    dyn_buffer_write_int(buf, (int) dimensions.height);
    dyn_buffer_write_short(buf, 1); // Color planes
    dyn_buffer_write_short(buf, 8); // bits per pixel
    dyn_buffer_write_int(buf, 0); // compression
    dyn_buffer_write_int(buf, 0); // Ignored
    dyn_buffer_write_int(buf, 3800); // hres
    dyn_buffer_write_int(buf, 3800); // vres
    dyn_buffer_write_int(buf, 256); // Color count
    dyn_buffer_write_int(buf, 0); // Ignored

    // RGBA32 Color table (Grayscale)
    for (int i = 255; i >= 0; i--) {
        dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
    }

    // Each row must occupy a multiple of 4 bytes. The padding has to be
    // derived from the row length, not from the buffer cursor: the pixel
    // array starts at offset 14 + 40 + 1024 = 1078, which is not 4-byte
    // aligned, so aligning the cursor would mis-pad the first row.
    const unsigned int row_padding = (4 - dimensions.width % 4) % 4;

    // Pixel array: write from bottom to top
    for (int y = (int) dimensions.height - 1; y >= 0; y--) {
        for (unsigned int x = 0; x < dimensions.width; x++) {
            dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
        }
        for (unsigned int pad = 0; pad < row_padding; pad++) {
            dyn_buffer_write_char(buf, 0);
        }
    }

    // Size: offset 2 is not int-aligned, so copy the bytes instead of
    // storing through a casted pointer
    int file_size = (int) buf->cur;
    memcpy((char *) buf->buf + 2, &file_size, sizeof(file_size));
}
/**
 * Extract the name of an in-memory font and store a rendering of that name.
 *
 * Appends a MetaFontName entry ("<family> <style>", or just the family when
 * the style is missing or starts with '?') to `doc`, then renders the name
 * with the font itself at 64 pixels and writes it as a BMP to the index store
 * under the document's uuid.
 *
 * @param buf     font file contents
 * @param buf_len number of bytes in `buf`
 * @param doc     document receiving the metadata
 */
void parse_font(const char *buf, size_t buf_len, document_t *doc) {
    if (library == NULL && FT_Init_FreeType(&library) != 0) {
        library = NULL;
        return;
    }

    FT_Face face;
    FT_Error err = FT_New_Memory_Face(library, (unsigned char *) buf, buf_len, 0, &face);
    if (err != 0) {
        return;
    }

    // snprintf() bounds both variants; family_name is allowed to be NULL
    char font_name[1024];
    const char *family_name = face->family_name != NULL ? face->family_name : "";
    if (face->style_name == NULL || *(face->style_name) == '?') {
        snprintf(font_name, sizeof(font_name), "%s", family_name);
    } else {
        snprintf(font_name, sizeof(font_name), "%s %s", family_name, face->style_name);
    }

    meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name) + 1);
    if (meta_name == NULL) {
        FT_Done_Face(face);
        return;
    }
    meta_name->key = MetaFontName;
    strcpy(meta_name->strval, font_name);
    APPEND_META(doc, meta_name)

    int pixel = 64;
    int num_chars = (int) strlen(font_name);

    err = FT_Set_Pixel_Sizes(face, 0, pixel);
    if (err != 0) {
        // The face must be released on every exit path once it was created
        FT_Done_Face(face);
        return;
    }

    text_dimensions_t dimensions = text_dimension(font_name, face);
    unsigned char *bitmap = calloc(dimensions.height, dimensions.width);
    if (bitmap == NULL) {
        FT_Done_Face(face);
        return;
    }

    FT_Vector pen;
    pen.x = 0;

    char pc = 0;
    for (int i = 0; i < num_chars; i++) {
        char c = font_name[i];

        err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
        if (err != 0) {
            continue;
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        pen.x += kerning_offset(c, pc, face);
        // Align the glyph's baseline with the baseline of the whole text
        pen.y = dimensions.height - glyph.ascent - dimensions.baseline;

        draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);

        pen.x += glyph.advance_width;
        pc = c;
    }

    dyn_buffer_t bmp_data = dyn_buffer_create();
    bmp_format(&bmp_data, dimensions, bitmap);

    store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);
    FT_Done_Face(face);
}

9
src/parsing/font.h Normal file
View File

@@ -0,0 +1,9 @@
#ifndef SIST2_FONT_H
#define SIST2_FONT_H
#include "src/sist.h"
/**
 * Parse an in-memory font: appends a MetaFontName entry to `doc` and stores a
 * BMP rendering of the font name in the index store under the document's uuid.
 *
 * @param buf     font file contents
 * @param buf_len number of bytes in `buf`
 * @param doc     document receiving the metadata
 */
void parse_font(const char * buf, size_t buf_len, document_t *doc);
#endif

269
src/parsing/media.c Normal file
View File

@@ -0,0 +1,269 @@
#include "src/sist.h"
#include "src/ctx.h"
AVCodecContext *alloc_jpeg_encoder(int dstW, int dstH, float qscale) {
AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
jpeg->width = dstW;
jpeg->height = dstH;
jpeg->time_base.den = 1000000;
jpeg->time_base.num = 1;
jpeg->i_quant_factor = qscale;
jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;
int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
if (ret != 0) {
printf("Could not open jpeg encoder: %s!\n", av_err2str(ret));
return NULL;
}
return jpeg;
}
/**
 * Produce a YUVJ420P copy of `frame`, shrunk so that its longest side is
 * `size` while keeping the aspect ratio. Frames that already fit within
 * size x size keep their dimensions.
 *
 * The pixel buffer of the returned frame is allocated with av_malloc() and
 * attached with avpicture_fill(); parse_media() releases it through
 * av_free(*frame->data) before freeing the frame.
 *
 * @param decoder decoder that produced `frame` (source size and pixel format)
 * @param frame   decoded source frame
 * @param size    maximum width/height of the result
 * @return newly allocated scaled frame
 */
AVFrame *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
    AVFrame *out = av_frame_alloc();

    // Start from the source size and only shrink when a side exceeds `size`
    int dstW = frame->width;
    int dstH = frame->height;
    if (frame->width > size || frame->height > size) {
        const double aspect = (double) frame->width / frame->height;

        if (frame->width > frame->height) {
            dstW = size;
            dstH = (int) (size / aspect);
        } else {
            dstW = (int) (size * aspect);
            dstH = size;
        }
    }

    struct SwsContext *scaler = sws_getContext(
            decoder->width, decoder->height, decoder->pix_fmt,
            dstW, dstH, AV_PIX_FMT_YUVJ420P,
            SWS_FAST_BILINEAR, 0, 0, 0
    );

    int pixels_len = avpicture_get_size(AV_PIX_FMT_YUVJ420P, dstW, dstH);
    uint8_t *pixels = (uint8_t *) av_malloc(pixels_len);
    avpicture_fill((AVPicture *) out, pixels, AV_PIX_FMT_YUVJ420P, dstW, dstH);

    sws_scale(scaler,
              (const uint8_t *const *) frame->data, frame->linesize,
              0, decoder->height,
              out->data, out->linesize
    );
    sws_freeContext(scaler);

    out->width = dstW;
    out->height = dstH;
    out->format = AV_PIX_FMT_YUV420P;

    return out;
}
/**
 * Read packets from `pFormatCtx` until the decoder outputs one frame of the
 * requested stream.
 *
 * @param pFormatCtx opened input
 * @param decoder    opened decoder for the stream
 * @param stream_idx index of the stream to decode; other packets are skipped
 * @return the decoded frame (to be released with av_frame_free()), or NULL
 *         when the input ends, cannot be read, or cannot be decoded
 */
AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx) {
    AVFrame *frame = av_frame_alloc();
    if (frame == NULL) {
        return NULL;
    }

    AVPacket avPacket;
    av_init_packet(&avPacket);
    // av_init_packet() leaves data and size untouched
    avPacket.data = NULL;
    avPacket.size = 0;

    int receive_ret = AVERROR(EAGAIN);
    while (receive_ret == AVERROR(EAGAIN)) {
        // Get video frame
        while (1) {
            int read_frame_ret = av_read_frame(pFormatCtx, &avPacket);

            if (read_frame_ret != 0) {
                if (read_frame_ret != AVERROR_EOF) {
                    fprintf(stderr, "Error reading frame: %s\n", av_err2str(read_frame_ret));
                }
                av_frame_free(&frame);
                av_packet_unref(&avPacket);
                return NULL;
            }

            //Ignore audio/other frames
            if (avPacket.stream_index != stream_idx) {
                av_packet_unref(&avPacket);
                continue;
            }
            break;
        }

        // Feed it to decoder
        int decode_ret = avcodec_send_packet(decoder, &avPacket);
        if (decode_ret != 0) {
            printf("Error decoding frame: %s\n", av_err2str(decode_ret));
        }
        av_packet_unref(&avPacket);
        receive_ret = avcodec_receive_frame(decoder, frame);
    }

    // Anything other than success means `frame` holds no picture; handing it
    // to the scaler would read from empty data pointers.
    if (receive_ret != 0) {
        av_frame_free(&frame);
        return NULL;
    }

    return frame;
}
/**
 * Copy the artist, genre, title, album_artist and album tags of the container
 * metadata into `doc`.
 *
 * Tag keys are lower-cased in place (inside the dictionary) before being
 * compared, so the match is case-insensitive.
 *
 * @param pFormatCtx opened input whose metadata dictionary is scanned
 * @param doc        document receiving one string meta entry per matching tag
 */
void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
    static const struct {
        const char *name;
        int key;
    } wanted_tags[] = {
            {"artist",       MetaArtist},
            {"genre",        MetaGenre},
            {"title",        MetaTitle},
            {"album_artist", MetaAlbumArtist},
            {"album",        MetaAlbum},
    };
    const size_t wanted_count = sizeof(wanted_tags) / sizeof(wanted_tags[0]);

    AVDictionaryEntry *tag = NULL;
    while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        // tolower() requires a value representable as unsigned char
        for (char *key = tag->key; *key; ++key) {
            *key = (char) tolower((unsigned char) *key);
        }

        for (size_t i = 0; i < wanted_count; i++) {
            if (strcmp(tag->key, wanted_tags[i].name) != 0) {
                continue;
            }

            size_t len = strlen(tag->value);
            // + 1: copy the terminating NUL as well, so strval is a valid C string
            meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + len + 1);
            if (meta_tag != NULL) {
                meta_tag->key = wanted_tags[i].key;
                memcpy(meta_tag->strval, tag->value, len + 1);
                APPEND_META(doc, meta_tag)
            }
            break;
        }
    }
}
/**
 * Extract metadata and a thumbnail from an audio, video or image file.
 *
 * Appends the codec id of every audio/video stream, the audio tags, the video
 * dimensions and (when the video stream has more than one frame) duration and
 * bitrate to `doc`. If a video stream larger than 20x20 exists, one frame is
 * decoded, scaled to ScanCtx.tn_size, encoded as JPEG and written to the index
 * store under the document's uuid. Thumbnail failures are not fatal: the
 * metadata gathered so far is kept.
 *
 * @param filepath path of the media file
 * @param doc      document receiving the metadata
 */
void parse_media(const char *filepath, document_t *doc) {

    int video_stream = -1;

    AVStream *stream = NULL;
    AVCodec *video_codec = NULL;
    AVCodecContext *decoder = NULL;
    AVCodecContext *jpeg_encoder = NULL;
    AVFrame *frame = NULL;
    AVFrame *scaled_frame = NULL;

    AVFormatContext *pFormatCtx = avformat_alloc_context();
    int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
    if (res < 0) {
        // avformat_open_input() releases the context itself on failure
        printf("ERR%s %s\n", filepath, av_err2str(res));
        return;
    }

    avformat_find_stream_info(pFormatCtx, NULL);

    for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
        AVStream *current = pFormatCtx->streams[i];

        if (current->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            meta_line_t *meta_audio = malloc(sizeof(meta_line_t));
            meta_audio->key = MetaMediaAudioCodec;
            meta_audio->intval = current->codecpar->codec_id;
            APPEND_META(doc, meta_audio)

            append_audio_meta(pFormatCtx, doc);

        } else if (current->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
            meta_line_t *meta_vid = malloc(sizeof(meta_line_t));
            meta_vid->key = MetaMediaVideoCodec;
            meta_vid->intval = current->codecpar->codec_id;
            APPEND_META(doc, meta_vid)

            meta_line_t *meta_w = malloc(sizeof(meta_line_t));
            meta_w->key = MetaWidth;
            meta_w->intval = current->codecpar->width;
            APPEND_META(doc, meta_w)

            meta_line_t *meta_h = malloc(sizeof(meta_line_t));
            meta_h->key = MetaHeight;
            meta_h->intval = current->codecpar->height;
            APPEND_META(doc, meta_h)

            video_stream = i;
        }
    }

    if (video_stream == -1) {
        goto cleanup;
    }

    stream = pFormatCtx->streams[video_stream];

    if (stream->nb_frames > 1) {
        //This is a video (not a still image)
        meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
        meta_duration->key = MetaMediaDuration;
        meta_duration->longval = pFormatCtx->duration / AV_TIME_BASE;
        APPEND_META(doc, meta_duration)

        meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
        meta_bitrate->key = MetaMediaBitrate;
        meta_bitrate->intval = pFormatCtx->bit_rate;
        APPEND_META(doc, meta_bitrate)
    }

    // Too small to be worth a thumbnail
    if (stream->codecpar->width <= 20 || stream->codecpar->height <= 20) {
        goto cleanup;
    }

    // Decoder
    video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    if (video_codec == NULL) {
        goto cleanup;
    }
    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }
    if (avcodec_parameters_to_context(decoder, stream->codecpar) < 0 ||
        avcodec_open2(decoder, video_codec, NULL) < 0) {
        goto cleanup;
    }

    //Seek to 10% of the stream so the thumbnail is not the very first frame
    if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
        int seek_ret = 0;
        for (int i = 20; i >= 0; i--) {
            seek_ret = av_seek_frame(pFormatCtx, video_stream,
                                     (int64_t) (stream->duration * 0.10), 0);
            if (seek_ret == 0) {
                break;
            }
        }
    }

    frame = read_frame(pFormatCtx, decoder, video_stream);
    if (frame == NULL) {
        goto cleanup;
    }

    // Scale frame
    scaled_frame = scale_frame(decoder, frame, ScanCtx.tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    // Encode frame to jpeg
    jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, ScanCtx.tn_qscale);
    if (jpeg_encoder != NULL && avcodec_send_frame(jpeg_encoder, scaled_frame) == 0) {
        AVPacket jpeg_packet;
        av_init_packet(&jpeg_packet);
        jpeg_packet.data = NULL;
        jpeg_packet.size = 0;

        // Save thumbnail, but only if the encoder really produced one
        if (avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
            store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid),
                        (char *) jpeg_packet.data, jpeg_packet.size);
        }
        av_packet_unref(&jpeg_packet);
    }

cleanup:
    if (scaled_frame != NULL) {
        // The pixel buffer was allocated separately by scale_frame()
        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }
    av_frame_free(&frame);
    avcodec_free_context(&jpeg_encoder);
    avcodec_free_context(&decoder);

    // Closes the input, frees the context and resets the pointer to NULL
    avformat_close_input(&pFormatCtx);
}

11
src/parsing/media.h Normal file
View File

@@ -0,0 +1,11 @@
#ifndef SIST2_MEDIA_H
#define SIST2_MEDIA_H
#include "src/sist.h"
// Video files smaller than this many bytes are not handed to parse_media().
// Parenthesized so the macro expands safely inside larger expressions.
#define MIN_VIDEO_SIZE (1024 * 64)
/**
 * Open a media file, append its stream/tag metadata to `doc` and, when it has
 * a video stream larger than 20x20, store a JPEG thumbnail in the index store
 * under the document's uuid.
 *
 * @param filepath path of the media file
 * @param doc      document receiving the metadata
 */
void parse_media(const char * filepath, document_t *doc);
#endif

20
src/parsing/mime.c Normal file
View File

@@ -0,0 +1,20 @@
#include "mime.h"
/**
 * Look up the mime id registered for a file extension, ignoring case.
 *
 * @param ext_table table mapping lower-case extensions to mime ids
 * @param ext       NUL-terminated extension
 * @return the mime id, or 0 when the extension is unknown or too long to be
 *         a registered one
 */
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext) {
    char lower[64];
    size_t len = 0;

    while (ext[len] != '\0') {
        // File names are untrusted input: never write past the buffer
        if (len == sizeof(lower) - 1) {
            return 0;
        }
        lower[len] = (char) tolower((unsigned char) ext[len]);
        len++;
    }
    lower[len] = '\0';

    return (size_t) g_hash_table_lookup(ext_table, lower);
}
/**
 * Look up the mime id registered for a mime type string.
 *
 * Leading spaces and '[' characters are skipped before the lookup.
 *
 * @param mime_table table mapping mime type strings to mime ids
 * @param str        NUL-terminated mime type string
 * @return the value stored in the table, or 0 when there is no entry
 */
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str) {
    const char *start;

    for (start = str; *start == ' ' || *start == '['; start++) {
        // skip the prefix
    }

    return (size_t) g_hash_table_lookup(mime_table, start);
}

45
src/parsing/mime.h Normal file
View File

@@ -0,0 +1,45 @@
#ifndef SIST2_MIME_H
#define SIST2_MIME_H
#include "src/sist.h"
// Every macro below is fully parenthesized (arguments and expansion) so it
// can be negated or combined with other operators without precedence surprises.

// Major type of a mime id (bits 16-27); compared against enum major_mime
#define MAJOR_MIME(mime_id) (((mime_id) & 0x0FFF0000) >> 16)
// Mime id assigned to empty (zero-length) files
#define MIME_EMPTY 1
// Flag bit: files with this bit set are not handed to any parser
#define DONT_PARSE 0x80000000
#define SHOULD_PARSE(mime_id) (((mime_id) & DONT_PARSE) != DONT_PARSE)
// Flag bit: the file is handed to parse_pdf()
#define PDF_MASK 0x40000000
#define IS_PDF(mime_id) (((mime_id) & PDF_MASK) == PDF_MASK)
// Flag bit: the file is handed to parse_font()
#define FONT_MASK 0x20000000
#define IS_FONT(mime_id) (((mime_id) & FONT_MASK) == FONT_MASK)
// Major mime types. These are the values MAJOR_MIME() extracts from a mime id;
// parse() compares the extracted value against MimeVideo, MimeAudio, MimeImage
// and MimeText to pick a parser.
enum major_mime {
MimeInvalid = 0,
MimeModel = 1,
MimeExample = 2,
MimeMessage = 3,
MimeMultipart = 4,
MimeFont = 5,
MimeVideo = 6,
MimeAudio = 7,
MimeImage = 8,
MimeText = 9,
MimeApplication = 10,
};
// Forward declaration only.
// NOTE(review): presumably defined in the generated mime source; confirm.
enum mime;
// NOTE(review): the next three functions are not defined in the visible sources
// (presumably in mime_generated.c). The tables they return are what the two
// lookup functions below take as first argument; mime_get_mime_text() is used by
// parse_pdf() to obtain the string passed to MuPDF for a mime id.
GHashTable *mime_get_mime_table();
GHashTable *mime_get_ext_table();
char *mime_get_mime_text(unsigned int);
// Case-insensitive lookup of a file extension; returns 0 when it is not in the table.
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext);
// Lookup of a mime type string, skipping leading ' ' and '[' characters;
// returns 0 when it is not in the table.
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str);
#endif

1552
src/parsing/mime_generated.c Normal file

File diff suppressed because it is too large Load Diff

126
src/parsing/parse.c Normal file
View File

@@ -0,0 +1,126 @@
#include "src/sist.h"
#include "src/ctx.h"
// Per-thread libmagic handle; parse() opens and loads it the first time it
// runs on a thread and finds it NULL.
__thread magic_t Magic;
/**
 * Return the whole content of the job's file in a freshly allocated buffer.
 *
 * The first `bytes_read` bytes are taken from `buf` (what the caller already
 * read); the remainder is read from `*fd`, which is opened first when it is
 * still -1. If the file turns out to be shorter than its recorded size, the
 * missing tail is zero-filled.
 *
 * The job stays owned by the caller: it is never freed here.
 *
 * @param job        job describing the file (path and stat info)
 * @param buf        bytes already read from the start of the file
 * @param bytes_read number of valid bytes in `buf`
 * @param fd         in/out file descriptor, -1 when the file is not open yet
 * @return buffer of job->info.st_size bytes to be released with free(), or
 *         NULL when the file cannot be opened or memory is exhausted
 */
void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) {
    const size_t total = (size_t) job->info.st_size;
    size_t have = bytes_read > 0 ? (size_t) bytes_read : 0;
    if (have > total) {
        have = total;
    }

    if (have < total && *fd == -1) {
        *fd = open(job->filepath, O_RDONLY);
        if (*fd == -1) {
            perror("open");
            printf("%s\n", job->filepath);
            // The caller still uses and frees the job, so it must not be
            // freed here.
            return NULL;
        }
    }

    char *full_buf = malloc(total);
    if (full_buf == NULL) {
        perror("malloc");
        return NULL;
    }
    memcpy(full_buf, buf, have);

    // read() may return fewer bytes than requested: loop until done
    while (have < total) {
        ssize_t ret = read(*fd, full_buf + have, total - have);
        if (ret == -1) {
            perror("read");
            break;
        }
        if (ret == 0) {
            break;
        }
        have += (size_t) ret;
    }

    // Never hand uninitialized memory to the parsers
    if (have < total) {
        memset(full_buf + have, 0, total - have);
    }

    return full_buf;
}
/**
 * Process one file: determine its mime type, run the matching parser and
 * write the resulting document to the index.
 *
 * Files whose inode is recorded in the original index with an unchanged
 * modification time are only marked for copy. The mime type comes from the
 * file extension when it is known, otherwise from libmagic applied to the
 * first PARSE_BUF_SIZE bytes of the file.
 *
 * @param arg a parse_job_t* owned by this function; it is freed before returning
 */
void parse(void *arg) {

    parse_job_t *job = arg;
    document_t doc;

    if (incremental_get(ScanCtx.original_table, job->info.st_ino) == job->info.st_mtim.tv_sec) {
        incremental_mark_file_for_copy(ScanCtx.copy_table, job->info.st_ino);
        free(job);
        return;
    }

    if (Magic == NULL) {
        Magic = magic_open(MAGIC_MIME_TYPE);
        magic_load(Magic, NULL);
    }

    doc.filepath = job->filepath;
    doc.ext = (short) job->ext;
    doc.base = (short) job->base;
    doc.meta_head = NULL;
    doc.meta_tail = NULL;
    doc.mime = 0;
    doc.size = job->info.st_size;
    doc.ino = job->info.st_ino;
    doc.mtime = job->info.st_mtim.tv_sec;

    uuid_generate_time_safe(doc.uuid);

    // A byte buffer, not an array of pointers
    char buf[PARSE_BUF_SIZE];

    if (job->info.st_size == 0) {
        doc.mime = MIME_EMPTY;
    } else if (*(job->filepath + job->ext) != '\0') {
        doc.mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
    }

    int fd = -1;
    int bytes_read = 0;

    if (doc.mime == 0) {
        // Get mime type with libmagic
        fd = open(job->filepath, O_RDONLY);
        if (fd == -1) {
            perror("open");
            free(job);
            return;
        }

        bytes_read = read(fd, buf, PARSE_BUF_SIZE);
        if (bytes_read == -1) {
            // -1 must never be used as a buffer length further down
            perror("read");
            close(fd);
            free(job);
            return;
        }

        const char *magic_mime_str = magic_buffer(Magic, buf, bytes_read);
        if (magic_mime_str != NULL) {
            doc.mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);

            if (doc.mime == 0) {
                fprintf(stderr, "Couldn't find mime %s, %s!\n", magic_mime_str, job->filepath + job->base);
            }
        }
    }

    int mmime = MAJOR_MIME(doc.mime);

    if (!(SHOULD_PARSE(doc.mime))) {
        // Nothing to extract for this type: index the file as-is
    } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) {
        parse_media(job->filepath, &doc);

    } else if (IS_PDF(doc.mime)) {
        void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
        if (pdf_buf != NULL) {
            parse_pdf(pdf_buf, doc.size, &doc);

            if (pdf_buf != (void *) buf) {
                free(pdf_buf);
            }
        }

    } else if (mmime == MimeText && ScanCtx.content_size > 0) {
        parse_text(bytes_read, &fd, (char *) buf, &doc);

    } else if (IS_FONT(doc.mime)) {
        void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
        if (font_buf != NULL) {
            parse_font(font_buf, doc.size, &doc);

            if (font_buf != (void *) buf) {
                free(font_buf);
            }
        }
    }

    write_document(&doc);

    if (fd != -1) {
        close(fd);
    }

    free(job);
}

10
src/parsing/parse.h Normal file
View File

@@ -0,0 +1,10 @@
#ifndef SIST2_PARSE_H
#define SIST2_PARSE_H
#include "src/sist.h"
// Number of bytes parse() reads from the start of a file for libmagic detection
#define PARSE_BUF_SIZE 4096
/**
 * Process one file described by a parse_job_t: detect its mime type, run the
 * matching parser and write the document to the index.
 *
 * @param arg a parse_job_t*; it is freed by this function
 */
void parse(void *arg);
#endif

114
src/parsing/pdf.c Normal file
View File

@@ -0,0 +1,114 @@
#include "pdf.h"
#include "src/ctx.h"
/**
 * Render the first page of a document as a PNG thumbnail and store it in the
 * index store under the document's uuid.
 *
 * The page is scaled uniformly so that its longest side becomes
 * ScanCtx.tn_size.
 *
 * @param ctx   MuPDF context
 * @param doc   document whose uuid is used as the store key
 * @param fzdoc opened MuPDF document
 * @return the loaded first page; the caller is responsible for dropping it
 */
fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
    fz_page *first_page = fz_load_page(ctx, fzdoc, 0);
    fz_rect box = fz_bound_page(ctx, first_page);

    float page_w = (float) box.x1 - box.x0;
    float page_h = (float) box.y1 - box.y0;

    // Fit the longest side into the thumbnail size
    float longest_side = page_w > page_h ? page_w : page_h;
    float zoom = (float) ScanCtx.tn_size / longest_side;
    fz_matrix transform = fz_scale(zoom, zoom);

    fz_colorspace *rgb = fz_device_rgb(ctx);
    fz_pixmap *rendered = fz_new_pixmap_from_page(ctx, first_page, transform, rgb, 0);
    fz_buffer *png = fz_new_buffer_from_pixmap_as_png(ctx, rendered, fz_default_color_params);

    unsigned char *png_bytes;
    size_t png_len = fz_buffer_storage(ctx, png, &png_bytes);

    store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) png_bytes, png_len);

    fz_drop_pixmap(ctx, rendered);
    fz_drop_buffer(ctx, png);

    return first_page;
}
/**
 * Message callback that discards everything it is given. parse_pdf() installs
 * it as MuPDF's warning and error printer to silence the library.
 *
 * @param user    unused
 * @param message unused
 */
void fz_noop_callback(void *user, const char *message) {
    (void) user;
    (void) message;
}
/**
 * Extract a cover thumbnail and the text content of an in-memory document
 * with MuPDF.
 *
 * The first page is rendered and stored by render_cover(); the characters of
 * every text block of every page are then collected, up to
 * ScanCtx.content_size, and appended to `doc` as a MetaContent entry. When
 * MuPDF raises an error nothing is appended.
 *
 * @param buf     document contents
 * @param buf_len number of bytes in `buf`
 * @param doc     document receiving the metadata (its mime id selects the
 *                MuPDF handler)
 */
void parse_pdf(void *buf, size_t buf_len, document_t *doc) {

    fz_context *ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
    if (ctx == NULL) {
        return;
    }

    // Everything that must be released is declared outside of fz_try() (and
    // marked with fz_var) so that fz_always() can drop it on both the normal
    // and the error path.
    fz_stream *stream = NULL;
    fz_document *fzdoc = NULL;
    fz_page *page = NULL;
    fz_stext_page *stext = NULL;
    text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size);

    fz_var(stream);
    fz_var(fzdoc);
    fz_var(page);
    fz_var(stext);

    fz_try(ctx)
    {
        fz_register_document_handlers(ctx);

        ctx->warn.print = fz_noop_callback; //disable warnings
        ctx->error.print = fz_noop_callback;

        stream = fz_open_memory(ctx, buf, buf_len);
        fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);

        int page_count = fz_count_pages(ctx, fzdoc);

        // The cover page is reused as page 0 of the text extraction below
        page = render_cover(ctx, doc, fzdoc);

        fz_stext_options opts;
        fz_parse_stext_options(ctx, &opts, "preserve-ligatures");

        int buf_full = 0;

        for (int current_page = 0; current_page < page_count && !buf_full; current_page++) {
            if (current_page != 0) {
                page = fz_load_page(ctx, fzdoc, current_page);
            }

            stext = fz_new_stext_page_from_page(ctx, page, &opts);

            // The for-loop advances `block` on every iteration, including
            // the ones that skip a non-text block.
            for (fz_stext_block *block = stext->first_block; block != NULL && !buf_full; block = block->next) {
                if (block->type != FZ_STEXT_BLOCK_TEXT) {
                    continue;
                }

                for (fz_stext_line *line = block->u.t.first_line; line != NULL && !buf_full; line = line->next) {
                    for (fz_stext_char *c = line->first_char; c != NULL; c = c->next) {
                        if (text_buffer_append_char(&text_buf, c->c) == TEXT_BUF_FULL) {
                            buf_full = 1;
                            break;
                        }
                    }
                }
            }

            fz_drop_stext_page(ctx, stext);
            stext = NULL;
            fz_drop_page(ctx, page);
            page = NULL;
        }

        text_buffer_terminate_string(&text_buf);

        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur);
        if (meta_content != NULL) {
            meta_content->key = MetaContent;
            memcpy(meta_content->strval, text_buf.dyn_buffer.buf, text_buf.dyn_buffer.cur);
            APPEND_META(doc, meta_content)
        }
    }
    fz_always(ctx)
    {
        fz_drop_stext_page(ctx, stext);
        fz_drop_page(ctx, page);
        fz_drop_document(ctx, fzdoc);
        fz_drop_stream(ctx, stream);
    }
    fz_catch(ctx)
    {
        // Unreadable document: keep whatever metadata was gathered elsewhere
    }

    text_buffer_destroy(&text_buf);

    // The context may only be dropped once the try/catch machinery, which
    // reads it, is done.
    fz_drop_context(ctx);
}

9
src/parsing/pdf.h Normal file
View File

@@ -0,0 +1,9 @@
#ifndef SIST2_PDF_H
#define SIST2_PDF_H
#include "src/sist.h"
/**
 * Parse an in-memory document with MuPDF: stores a PNG thumbnail of the first
 * page in the index store and appends the extracted text to `doc` as a
 * MetaContent entry.
 *
 * @param buf     document contents
 * @param buf_len number of bytes in `buf`
 * @param doc     document receiving the metadata
 */
void parse_pdf(void *buf, size_t buf_len, document_t *doc);
#endif

43
src/parsing/text.c Normal file
View File

@@ -0,0 +1,43 @@
#include "text.h"
#include "src/ctx.h"
/**
 * Append the beginning of a text file to `doc` as a MetaContent entry.
 *
 * At most MIN(ScanCtx.content_size, doc->size) bytes are used. The first
 * `bytes_read` of them come from `buf`; the rest is read from `*fd`, which is
 * opened first when it is still -1. If the file cannot be opened or read
 * completely, only the bytes actually obtained are indexed.
 *
 * @param bytes_read number of valid bytes in `buf`
 * @param fd         in/out file descriptor, -1 when the file is not open yet
 * @param buf        bytes already read from the start of the file
 * @param doc        document receiving the metadata
 */
void parse_text(int bytes_read, int *fd, char *buf, document_t *doc) {

    if (bytes_read < 0) {
        bytes_read = 0;
    }

    int wanted = MIN(ScanCtx.content_size, doc->size);
    if (wanted < 0) {
        wanted = 0;
    }

    // + 1 so that the allocation is never zero-sized
    char *intermediate_buf = malloc((size_t) wanted + 1);
    if (intermediate_buf == NULL) {
        return;
    }

    // Start with what the caller already has in memory
    int intermediate_buf_len = MIN(bytes_read, wanted);
    memcpy(intermediate_buf, buf, intermediate_buf_len);

    if (intermediate_buf_len < wanted) {
        if (*fd == -1) {
            *fd = open(doc->filepath, O_RDONLY);
        }

        // Only count bytes that were really read: the buffer is not
        // initialized beyond them.
        while (*fd != -1 && intermediate_buf_len < wanted) {
            ssize_t ret = read(*fd, intermediate_buf + intermediate_buf_len, wanted - intermediate_buf_len);
            if (ret <= 0) {
                break;
            }
            intermediate_buf_len += (int) ret;
        }
    }

    text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size);

    for (int i = 0; i < intermediate_buf_len; i++) {
        text_buffer_append_char(&text_buf, *(intermediate_buf + i));
    }
    text_buffer_terminate_string(&text_buf);

    meta_line_t *meta = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur);
    if (meta != NULL) {
        meta->key = MetaContent;
        strcpy(meta->strval, text_buf.dyn_buffer.buf);
        APPEND_META(doc, meta)
    }

    text_buffer_destroy(&text_buf);
    free(intermediate_buf);
}

8
src/parsing/text.h Normal file
View File

@@ -0,0 +1,8 @@
#ifndef SIST2_TEXT_H
#define SIST2_TEXT_H
#include "src/sist.h"
/**
 * Append the beginning of a text file (limited by ScanCtx.content_size and the
 * file size) to `doc` as a MetaContent entry.
 *
 * @param bytes_read number of valid bytes in `buf`
 * @param fd         in/out file descriptor; opened from doc->filepath when it
 *                   is -1 and more data is needed
 * @param buf        bytes already read from the start of the file
 * @param doc        document receiving the metadata
 */
void parse_text(int bytes_read, int *fd, char *buf, document_t *doc);
#endif