mirror of
https://github.com/simon987/sist2.git
synced 2025-12-14 15:59:03 +00:00
Initial commit (squashed)
This commit is contained in:
211
src/parsing/font.c
Normal file
211
src/parsing/font.c
Normal file
@@ -0,0 +1,211 @@
|
||||
#include "font.h"
|
||||
|
||||
#include "ft2build.h"
|
||||
#include "freetype/freetype.h"
|
||||
|
||||
#include "src/ctx.h"
|
||||
|
||||
__thread FT_Library library = NULL;
|
||||
|
||||
|
||||
/* Pixel extents of a rendered line of text, as filled in by text_dimension(). */
typedef struct text_dimensions {
    unsigned width;    /* sum of the per-glyph horizontal extents */
    unsigned height;   /* tallest ascent plus deepest descent */
    unsigned baseline; /* deepest descent found in the text */
} text_dimensions_t;
|
||||
|
||||
/* Metrics and bitmap of one rendered glyph, extracted from a FreeType glyph slot. */
typedef struct glyph {
    unsigned top;           /* copied from the slot's bitmap_top */
    unsigned height;        /* number of bitmap rows */
    unsigned width;         /* bitmap width */
    unsigned descent;       /* derived from height and top */
    unsigned ascent;        /* derived from top, height and descent */
    unsigned advance_width; /* slot advance.x divided by 64 */
    unsigned char *pixmap;  /* points at the slot's bitmap buffer (not a copy) */
} glyph_t;
|
||||
|
||||
|
||||
__always_inline
|
||||
int kerning_offset(char c, char pc, FT_Face face) {
|
||||
FT_Vector kerning;
|
||||
FT_Get_Kerning(face, c, pc, FT_KERNING_DEFAULT, &kerning);
|
||||
|
||||
return (int) (kerning.x / 64);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
|
||||
glyph_t glyph;
|
||||
|
||||
glyph.pixmap = slot->bitmap.buffer;
|
||||
|
||||
glyph.width = slot->bitmap.width;
|
||||
glyph.height = slot->bitmap.rows;
|
||||
glyph.top = slot->bitmap_top;
|
||||
glyph.advance_width = slot->advance.x / 64;
|
||||
|
||||
glyph.descent = MAX(0, glyph.height - glyph.top);
|
||||
glyph.ascent = MAX(0, MAX(glyph.top, glyph.height) - glyph.descent);
|
||||
|
||||
return glyph;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
glyph_t get_glyph(char character, FT_Face face) {
|
||||
}
|
||||
|
||||
/**
 * Measure the bitmap needed to render `text` with `face` at the face's
 * current pixel size.
 *
 * Glyphs are loaded with the same flags parse_font() uses when drawing, so
 * the measured extents match what is actually rendered. Characters that fail
 * to load are skipped, exactly as the drawing loop skips them.
 *
 * @param text NUL-terminated string to measure (NULL is treated as empty)
 * @param face face to measure with
 * @return width, height (max ascent + max descent) and baseline (max descent)
 */
text_dimensions_t text_dimension(char *text, FT_Face face) {
    text_dimensions_t dimensions = {0, 0, 0};

    if (text == NULL) {
        return dimensions;
    }

    unsigned int max_ascent = 0;
    unsigned int max_descent = 0;

    char pc = 0;
    for (const char *p = text; *p != '\0'; p++) {
        char c = *p;

        // FT_LOAD_RENDER is required for slot->bitmap to be populated
        if (FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER) != 0) {
            continue;
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        max_descent = MAX(max_descent, glyph.descent);
        max_ascent = MAX(max_ascent, glyph.ascent);

        int kerning_x = kerning_offset(c, pc, face);
        dimensions.width += MAX(glyph.advance_width, glyph.width) + kerning_x;

        pc = c;
    }

    dimensions.height = max_ascent + max_descent;
    dimensions.baseline = max_descent;

    return dimensions;
}
|
||||
|
||||
/**
 * OR a glyph's coverage bitmap into the text bitmap with its top-left corner
 * at (x, y).
 *
 * Pixels that fall outside the text_info.width x text_info.height canvas are
 * clipped, so a glyph that is too wide or placed at a negative offset can no
 * longer wrap onto the neighbouring row or write out of bounds.
 *
 * @param glyph     glyph to draw; its pixmap is read as glyph->width bytes per row
 * @param x         destination column of the glyph's left edge
 * @param y         destination row of the glyph's top edge
 * @param text_info dimensions of `bitmap`
 * @param bitmap    destination buffer, text_info.width * text_info.height bytes
 */
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
    if (glyph == NULL || glyph->pixmap == NULL || bitmap == NULL) {
        return;
    }

    for (unsigned int sy = 0; sy < glyph->height; sy++) {
        long long dst_y = (long long) y + sy;
        if (dst_y < 0 || dst_y >= (long long) text_info.height) {
            continue;
        }

        for (unsigned int sx = 0; sx < glyph->width; sx++) {
            long long dst_x = (long long) x + sx;
            if (dst_x < 0 || dst_x >= (long long) text_info.width) {
                continue;
            }

            size_t src = (size_t) sy * glyph->width + sx;
            size_t dst = (size_t) dst_y * text_info.width + (size_t) dst_x;
            bitmap[dst] |= glyph->pixmap[src];
        }
    }
}
|
||||
|
||||
/**
 * Serialise an 8-bit coverage bitmap as an 8 bpp palettised BMP file.
 *
 * The palette runs from white (index 0) to black (index 255), so full glyph
 * coverage comes out black on a white background.
 *
 * @param buf        output buffer; the file is expected to start at offset 0
 *                   (the size field is patched at buf->buf + 2)
 * @param dimensions width/height of `bitmap` in pixels
 * @param bitmap     dimensions.width * dimensions.height bytes, top row first
 */
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {

    dyn_buffer_write_short(buf, 0x4D42); // Magic
    dyn_buffer_write_int(buf, 0); // Size placeholder
    dyn_buffer_write_int(buf, 0x5157); //Reserved
    dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset

    dyn_buffer_write_int(buf, 40); // DIB size
    dyn_buffer_write_int(buf, (int) dimensions.width);
    dyn_buffer_write_int(buf, (int) dimensions.height);
    dyn_buffer_write_short(buf, 1); // Color planes
    dyn_buffer_write_short(buf, 8); // bits per pixel
    dyn_buffer_write_int(buf, 0); // compression
    dyn_buffer_write_int(buf, 0); // Ignored
    dyn_buffer_write_int(buf, 3800); // hres
    dyn_buffer_write_int(buf, 3800); // vres
    dyn_buffer_write_int(buf, 256); // Color count
    dyn_buffer_write_int(buf, 0); // Ignored

    // RGBA32 Color table (Grayscale)
    for (int i = 255; i >= 0; i--) {
        dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
    }

    // Each row is padded so that its own length is a multiple of 4 bytes. The
    // padding must be derived from the row width, not from the absolute buffer
    // offset: the pixel array starts at byte 1078, which is not 4-byte aligned.
    unsigned int row_padding = (4 - dimensions.width % 4) % 4;

    // Pixel array: write from bottom to top
    for (int y = (int) dimensions.height - 1; y >= 0; y--) {
        for (unsigned int x = 0; x < dimensions.width; x++) {
            dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
        }
        for (unsigned int pad = 0; pad < row_padding; pad++) {
            dyn_buffer_write_char(buf, 0);
        }
    }

    // Patch the total file size into the header. The field sits at offset 2 and
    // is therefore unaligned for an int, so copy the bytes instead of storing
    // through a casted pointer.
    int file_size = buf->cur;
    memcpy((char *) buf->buf + 2, &file_size, sizeof(file_size));
}
|
||||
|
||||
/**
 * Extract the name of a font held in memory, attach it to `doc` as
 * MetaFontName, and store a BMP preview of that name rendered in the font
 * itself under the document's uuid.
 *
 * @param buf     raw font file contents
 * @param buf_len number of bytes in `buf`
 * @param doc     document that receives the metadata and whose uuid keys the
 *                preview in the store
 */
void parse_font(const char *buf, size_t buf_len, document_t *doc) {
    if (library == NULL) {
        if (FT_Init_FreeType(&library) != 0) {
            library = NULL;
            return;
        }
    }

    FT_Face face;
    FT_Error err = FT_New_Memory_Face(library, (unsigned char *) buf, buf_len, 0, &face);
    if (err != 0) {
        return;
    }

    char font_name[1024];

    // Names come from the (untrusted) font file: they may be missing or longer
    // than the buffer, so always format through a bounded snprintf.
    const char *family_name = face->family_name != NULL ? face->family_name : "";
    if (face->style_name == NULL || *(face->style_name) == '?') {
        snprintf(font_name, sizeof(font_name), "%s", family_name);
    } else {
        snprintf(font_name, sizeof(font_name), "%s %s", family_name, face->style_name);
    }

    size_t name_len = strlen(font_name);
    meta_line_t *meta_name = malloc(sizeof(meta_line_t) + name_len + 1);
    if (meta_name == NULL) {
        FT_Done_Face(face);
        return;
    }
    meta_name->key = MetaFontName;
    memcpy(meta_name->strval, font_name, name_len + 1);
    APPEND_META(doc, meta_name)

    int pixel = 64;
    int num_chars = (int) name_len;

    err = FT_Set_Pixel_Sizes(face, 0, pixel);
    if (err != 0) {
        FT_Done_Face(face);
        return;
    }

    text_dimensions_t dimensions = text_dimension(font_name, face);
    if (dimensions.width == 0 || dimensions.height == 0) {
        // Nothing renderable (empty name or no usable glyphs): skip the preview
        FT_Done_Face(face);
        return;
    }

    unsigned char *bitmap = calloc(dimensions.height, dimensions.width);
    if (bitmap == NULL) {
        FT_Done_Face(face);
        return;
    }

    FT_Vector pen;
    pen.x = 0;
    pen.y = 0;

    char pc = 0;
    for (int i = 0; i < num_chars; i++) {
        char c = font_name[i];

        err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
        if (err != 0) {
            continue;
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        pen.x += kerning_offset(c, pc, face);
        // Align every glyph on the common baseline
        pen.y = dimensions.height - glyph.ascent - dimensions.baseline;

        draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);

        pen.x += glyph.advance_width;
        pc = c;
    }

    dyn_buffer_t bmp_data = dyn_buffer_create();
    bmp_format(&bmp_data, dimensions, bitmap);

    store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);

    FT_Done_Face(face);
}
|
||||
9
src/parsing/font.h
Normal file
9
src/parsing/font.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef SIST2_FONT_H
|
||||
#define SIST2_FONT_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
|
||||
void parse_font(const char * buf, size_t buf_len, document_t *doc);
|
||||
|
||||
#endif
|
||||
269
src/parsing/media.c
Normal file
269
src/parsing/media.c
Normal file
@@ -0,0 +1,269 @@
|
||||
#include "src/sist.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
AVCodecContext *alloc_jpeg_encoder(int dstW, int dstH, float qscale) {
|
||||
|
||||
AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
|
||||
AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
|
||||
jpeg->width = dstW;
|
||||
jpeg->height = dstH;
|
||||
jpeg->time_base.den = 1000000;
|
||||
jpeg->time_base.num = 1;
|
||||
jpeg->i_quant_factor = qscale;
|
||||
|
||||
jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;
|
||||
int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
|
||||
|
||||
if (ret != 0) {
|
||||
printf("Could not open jpeg encoder: %s!\n", av_err2str(ret));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return jpeg;
|
||||
}
|
||||
|
||||
AVFrame *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
|
||||
AVFrame *scaled_frame = av_frame_alloc();
|
||||
|
||||
int dstW;
|
||||
int dstH;
|
||||
if (frame->width <= size && frame->height <= size) {
|
||||
dstW = frame->width;
|
||||
dstH = frame->height;
|
||||
} else {
|
||||
double ratio = (double) frame->width / frame->height;
|
||||
if (frame->width > frame->height) {
|
||||
dstW = size;
|
||||
dstH = (int) (size / ratio);
|
||||
} else {
|
||||
dstW = (int) (size * ratio);
|
||||
dstH = size;
|
||||
}
|
||||
}
|
||||
|
||||
struct SwsContext *ctx = sws_getContext(
|
||||
decoder->width, decoder->height, decoder->pix_fmt,
|
||||
dstW, dstH, AV_PIX_FMT_YUVJ420P,
|
||||
SWS_FAST_BILINEAR, 0, 0, 0
|
||||
);
|
||||
|
||||
int dst_buf_len = avpicture_get_size(AV_PIX_FMT_YUVJ420P, dstW, dstH);
|
||||
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
|
||||
|
||||
avpicture_fill((AVPicture *) scaled_frame, dst_buf, AV_PIX_FMT_YUVJ420P, dstW, dstH);
|
||||
|
||||
sws_scale(ctx,
|
||||
(const uint8_t *const *) frame->data, frame->linesize,
|
||||
0, decoder->height,
|
||||
scaled_frame->data, scaled_frame->linesize
|
||||
);
|
||||
|
||||
scaled_frame->width = dstW;
|
||||
scaled_frame->height = dstH;
|
||||
scaled_frame->format = AV_PIX_FMT_YUV420P;
|
||||
|
||||
sws_freeContext(ctx);
|
||||
|
||||
return scaled_frame;
|
||||
}
|
||||
|
||||
/**
 * Read packets of stream `stream_idx` from the container and feed them to
 * `decoder` until it outputs one frame.
 *
 * @param pFormatCtx opened input
 * @param decoder    opened decoder for the selected stream
 * @param stream_idx index of the stream to decode; other packets are skipped
 * @return the decoded frame (release with av_frame_free()), or NULL on end of
 *         file, read error, allocation failure or decoder error
 */
AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx) {
    AVFrame *frame = av_frame_alloc();
    if (frame == NULL) {
        return NULL;
    }

    AVPacket avPacket;
    av_init_packet(&avPacket);
    // av_init_packet() leaves data/size untouched; set them explicitly
    avPacket.data = NULL;
    avPacket.size = 0;

    int receive_ret;
    do {
        // Get the next packet that belongs to the requested stream
        for (;;) {
            int read_frame_ret = av_read_frame(pFormatCtx, &avPacket);

            if (read_frame_ret != 0) {
                if (read_frame_ret != AVERROR_EOF) {
                    fprintf(stderr, "Error reading frame: %s\n", av_err2str(read_frame_ret));
                }
                av_frame_free(&frame);
                av_packet_unref(&avPacket);
                return NULL;
            }

            if (avPacket.stream_index == stream_idx) {
                break;
            }

            //Ignore audio/other frames
            av_packet_unref(&avPacket);
        }

        // Feed it to decoder
        int decode_ret = avcodec_send_packet(decoder, &avPacket);
        if (decode_ret != 0) {
            printf("Error decoding frame: %s\n", av_err2str(decode_ret));
        }
        av_packet_unref(&avPacket);

        // EAGAIN means the decoder needs more input before it can output
        receive_ret = avcodec_receive_frame(decoder, frame);
    } while (receive_ret == AVERROR(EAGAIN));

    if (receive_ret != 0) {
        // Any other error leaves `frame` without picture data
        av_frame_free(&frame);
        return NULL;
    }

    return frame;
}
|
||||
|
||||
/* Append a copy of `value` (including its NUL terminator) to doc's metadata under `key`. */
static void append_string_meta(document_t *doc, int key, const char *value) {
    size_t len = strlen(value);

    meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + len + 1);
    if (meta_tag == NULL) {
        return;
    }
    meta_tag->key = key;
    // len + 1: the terminator must be copied, malloc() memory is not zeroed
    memcpy(meta_tag->strval, value, len + 1);
    APPEND_META(doc, meta_tag)
}

/**
 * Copy the artist, genre, title, album_artist and album tags of the
 * container's metadata dictionary into `doc`.
 *
 * Tag keys are lower-cased in place inside the dictionary before they are
 * compared, so the match is case-insensitive.
 *
 * @param pFormatCtx opened input whose metadata dictionary is read
 * @param doc        document that receives the metadata lines
 */
void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
    static const struct {
        const char *name;
        int key;
    } wanted_tags[] = {
            {"artist",       MetaArtist},
            {"genre",        MetaGenre},
            {"title",        MetaTitle},
            {"album_artist", MetaAlbumArtist},
            {"album",        MetaAlbum},
    };

    AVDictionaryEntry *tag = NULL;
    while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        for (char *key = tag->key; *key; ++key) {
            *key = (char) tolower((unsigned char) *key);
        }

        for (size_t i = 0; i < sizeof(wanted_tags) / sizeof(wanted_tags[0]); i++) {
            if (strcmp(tag->key, wanted_tags[i].name) == 0) {
                append_string_meta(doc, wanted_tags[i].key, tag->value);
                break;
            }
        }
    }
}
|
||||
|
||||
/**
 * Collect codec/size/duration metadata for a media file and store a JPEG
 * thumbnail of one video frame under the document's uuid.
 *
 * Every step that can fail (decoder lookup/open, frame read, scaling, encoder
 * creation, encoding) is checked; on failure the thumbnail is skipped and all
 * resources acquired so far are released.
 *
 * @param filepath path of the file to open
 * @param doc      document that receives the metadata and whose uuid keys the
 *                 thumbnail in the store
 */
void parse_media(const char *filepath, document_t *doc) {

    int video_stream = -1;
    AVStream *stream = NULL;
    AVCodec *video_codec = NULL;
    AVCodecContext *decoder = NULL;
    AVCodecContext *jpeg_encoder = NULL;
    AVFrame *frame = NULL;
    AVFrame *scaled_frame = NULL;
    AVPacket jpeg_packet;

    AVFormatContext *pFormatCtx = avformat_alloc_context();
    int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
    if (res < 0) {
        // On failure avformat_open_input() has already freed the context
        printf("ERR%s %s\n", filepath, av_err2str(res));
        return;
    }

    // Best effort: keep going with whatever stream information is available
    avformat_find_stream_info(pFormatCtx, NULL);

    for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
        AVStream *cur = pFormatCtx->streams[i];

        if (cur->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            meta_line_t *meta_audio = malloc(sizeof(meta_line_t));
            meta_audio->key = MetaMediaAudioCodec;
            meta_audio->intval = cur->codecpar->codec_id;
            APPEND_META(doc, meta_audio)

            append_audio_meta(pFormatCtx, doc);

        } else if (cur->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {

            meta_line_t *meta_vid = malloc(sizeof(meta_line_t));
            meta_vid->key = MetaMediaVideoCodec;
            meta_vid->intval = cur->codecpar->codec_id;
            APPEND_META(doc, meta_vid)

            meta_line_t *meta_w = malloc(sizeof(meta_line_t));
            meta_w->key = MetaWidth;
            meta_w->intval = cur->codecpar->width;
            APPEND_META(doc, meta_w)

            meta_line_t *meta_h = malloc(sizeof(meta_line_t));
            meta_h->key = MetaHeight;
            meta_h->intval = cur->codecpar->height;
            APPEND_META(doc, meta_h)

            video_stream = i;
        }
    }

    if (video_stream == -1) {
        goto cleanup;
    }
    stream = pFormatCtx->streams[video_stream];

    if (stream->nb_frames > 1) {
        //This is a video (not a still image)
        meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
        meta_duration->key = MetaMediaDuration;
        meta_duration->longval = pFormatCtx->duration / AV_TIME_BASE;
        APPEND_META(doc, meta_duration)

        meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
        meta_bitrate->key = MetaMediaBitrate;
        meta_bitrate->intval = pFormatCtx->bit_rate;
        APPEND_META(doc, meta_bitrate)
    }

    // Too small to be worth a thumbnail
    if (stream->codecpar->width <= 20 || stream->codecpar->height <= 20) {
        goto cleanup;
    }

    // Decoder
    video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    if (video_codec == NULL) {
        goto cleanup;
    }
    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }
    if (avcodec_parameters_to_context(decoder, stream->codecpar) < 0) {
        goto cleanup;
    }
    if (avcodec_open2(decoder, video_codec, NULL) < 0) {
        goto cleanup;
    }

    //Seek
    if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
        int seek_ret = 0;
        for (int i = 20; i >= 0; i--) {
            seek_ret = av_seek_frame(pFormatCtx, video_stream,
                                     stream->duration * 0.10, 0);
            if (seek_ret == 0) {
                break;
            }
        }
    }

    frame = read_frame(pFormatCtx, decoder, video_stream);
    if (frame == NULL) {
        goto cleanup;
    }

    // Scale frame
    scaled_frame = scale_frame(decoder, frame, ScanCtx.tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    // Encode frame to jpeg
    jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, ScanCtx.tn_qscale);
    if (jpeg_encoder == NULL) {
        goto cleanup;
    }

    av_init_packet(&jpeg_packet);
    jpeg_packet.data = NULL;
    jpeg_packet.size = 0;

    if (avcodec_send_frame(jpeg_encoder, scaled_frame) == 0 &&
        avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
        // Save thumbnail
        store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) jpeg_packet.data, jpeg_packet.size);
        av_packet_unref(&jpeg_packet);
    }

cleanup:
    if (scaled_frame != NULL) {
        // The pixel buffer was av_malloc()ed by scale_frame() and is not
        // reference counted, so it has to be released by hand
        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }
    av_frame_free(&frame);
    avcodec_free_context(&jpeg_encoder);
    avcodec_free_context(&decoder);

    // Closes the input, frees the context and sets the pointer to NULL
    avformat_close_input(&pFormatCtx);
}
|
||||
|
||||
11
src/parsing/media.h
Normal file
11
src/parsing/media.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef SIST2_MEDIA_H
|
||||
#define SIST2_MEDIA_H
|
||||
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
/* Video files smaller than this many bytes are not handed to parse_media().
 * Parenthesised so the macro stays a single value inside larger expressions. */
#define MIN_VIDEO_SIZE (1024 * 64)
|
||||
|
||||
void parse_media(const char * filepath, document_t *doc);
|
||||
|
||||
#endif
|
||||
20
src/parsing/mime.c
Normal file
20
src/parsing/mime.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "mime.h"
|
||||
|
||||
/**
 * Look up the mime id registered for a file extension, ignoring case.
 *
 * @param ext_table table keyed by lower-case extension
 * @param ext       NUL-terminated extension
 * @return the value stored for the lower-cased extension, or 0 when there is
 *         none or when the extension does not fit the 64-byte work buffer
 */
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext) {
    char lower[64];
    size_t len = 0;

    for (; ext[len] != '\0'; len++) {
        // An extension this long cannot be a known one; refuse it instead of
        // writing past the end of the stack buffer
        if (len == sizeof(lower) - 1) {
            return 0;
        }
        lower[len] = (char) tolower((unsigned char) ext[len]);
    }
    lower[len] = '\0';

    return (size_t) g_hash_table_lookup(ext_table, lower);
}
|
||||
|
||||
/**
 * Look up the mime id for a mime-type string, skipping any leading spaces
 * and '[' characters before the lookup.
 *
 * @param mime_table table keyed by mime-type string
 * @param str        NUL-terminated mime-type string
 * @return the value stored for the trimmed string, converted to unsigned int
 */
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str) {
    int skip = 0;

    for (;;) {
        char ch = str[skip];
        if (ch != ' ' && ch != '[') {
            break;
        }
        skip += 1;
    }

    const char *trimmed = str + skip;
    return (size_t) g_hash_table_lookup(mime_table, trimmed);
}
|
||||
45
src/parsing/mime.h
Normal file
45
src/parsing/mime.h
Normal file
@@ -0,0 +1,45 @@
|
||||
#ifndef SIST2_MIME_H
|
||||
#define SIST2_MIME_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
/* Every macro below parenthesises its argument and its whole expansion so it
 * can be combined with !, ==, | etc. without precedence surprises. */

/* Major mime category, stored in bits 16..27 of a mime id */
#define MAJOR_MIME(mime_id) (((mime_id) & 0x0FFF0000) >> 16)

#define MIME_EMPTY 1

/* Flag bit: files with this mime id are not handed to any parser */
#define DONT_PARSE 0x80000000
#define SHOULD_PARSE(mime_id) (((mime_id) & DONT_PARSE) != DONT_PARSE)

/* Flag bit: mime id is handled by the PDF parser */
#define PDF_MASK 0x40000000
#define IS_PDF(mime_id) (((mime_id) & PDF_MASK) == PDF_MASK)

/* Flag bit: mime id is handled by the font parser */
#define FONT_MASK 0x20000000
#define IS_FONT(mime_id) (((mime_id) & FONT_MASK) == FONT_MASK)
|
||||
|
||||
/* Major mime categories; compared against the result of MAJOR_MIME(). */
enum major_mime {
    MimeInvalid     = 0,
    MimeModel       = 1,
    MimeExample     = 2,
    MimeMessage     = 3,
    MimeMultipart   = 4,
    MimeFont        = 5,
    MimeVideo       = 6,
    MimeAudio       = 7,
    MimeImage       = 8,
    MimeText        = 9,
    MimeApplication = 10,
};
|
||||
|
||||
enum mime;
|
||||
|
||||
GHashTable *mime_get_mime_table();
|
||||
|
||||
GHashTable *mime_get_ext_table();
|
||||
|
||||
char *mime_get_mime_text(unsigned int);
|
||||
|
||||
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext);
|
||||
|
||||
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str);
|
||||
|
||||
#endif
|
||||
1552
src/parsing/mime_generated.c
Normal file
1552
src/parsing/mime_generated.c
Normal file
File diff suppressed because it is too large
Load Diff
126
src/parsing/parse.c
Normal file
126
src/parsing/parse.c
Normal file
@@ -0,0 +1,126 @@
|
||||
#include "src/sist.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
__thread magic_t Magic;
|
||||
|
||||
void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) {
|
||||
|
||||
void *full_buf;
|
||||
|
||||
if (job->info.st_size <= bytes_read) {
|
||||
full_buf = malloc(job->info.st_size);
|
||||
memcpy(full_buf, buf, job->info.st_size);
|
||||
} else {
|
||||
if (*fd == -1) {
|
||||
*fd = open(job->filepath, O_RDONLY);
|
||||
if (*fd == -1) {
|
||||
perror("open");
|
||||
printf("%s\n", job->filepath);
|
||||
free(job);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
full_buf = malloc(job->info.st_size);
|
||||
memcpy(full_buf, buf, bytes_read);
|
||||
int ret = read(*fd, full_buf + bytes_read, job->info.st_size - bytes_read);
|
||||
if (ret == -1) {
|
||||
perror("read");
|
||||
}
|
||||
}
|
||||
|
||||
return full_buf;
|
||||
}
|
||||
|
||||
void parse(void *arg) {
|
||||
|
||||
parse_job_t *job = arg;
|
||||
document_t doc;
|
||||
|
||||
if (incremental_get(ScanCtx.original_table, job->info.st_ino) == job->info.st_mtim.tv_sec) {
|
||||
incremental_mark_file_for_copy(ScanCtx.copy_table, job->info.st_ino);
|
||||
free(job);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Magic == NULL) {
|
||||
Magic = magic_open(MAGIC_MIME_TYPE);
|
||||
magic_load(Magic, NULL);
|
||||
}
|
||||
|
||||
doc.filepath = job->filepath;
|
||||
doc.ext = (short) job->ext;
|
||||
doc.base = (short) job->base;
|
||||
doc.meta_head = NULL;
|
||||
doc.meta_tail = NULL;
|
||||
doc.mime = 0;
|
||||
doc.size = job->info.st_size;
|
||||
doc.ino = job->info.st_ino;
|
||||
doc.mtime = job->info.st_mtim.tv_sec;
|
||||
|
||||
uuid_generate_time_safe(doc.uuid);
|
||||
char *buf[PARSE_BUF_SIZE];
|
||||
|
||||
if (job->info.st_size == 0) {
|
||||
doc.mime = MIME_EMPTY;
|
||||
} else if (*(job->filepath + job->ext) != '\0') {
|
||||
doc.mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
|
||||
}
|
||||
|
||||
int fd = -1;
|
||||
int bytes_read = 0;
|
||||
|
||||
if (doc.mime == 0) {
|
||||
// Get mime type with libmagic
|
||||
fd = open(job->filepath, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
perror("open");
|
||||
free(job);
|
||||
return;
|
||||
}
|
||||
|
||||
bytes_read = read(fd, buf, PARSE_BUF_SIZE);
|
||||
|
||||
const char *magic_mime_str = magic_buffer(Magic, buf, bytes_read);
|
||||
if (magic_mime_str != NULL) {
|
||||
doc.mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);
|
||||
if (doc.mime == 0) {
|
||||
fprintf(stderr, "Couldn't find mime %s, %s!\n", magic_mime_str, job->filepath + job->base);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int mmime = MAJOR_MIME(doc.mime);
|
||||
|
||||
if (!(SHOULD_PARSE(doc.mime))) {
|
||||
|
||||
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) {
|
||||
parse_media(job->filepath, &doc);
|
||||
|
||||
} else if (IS_PDF(doc.mime)) {
|
||||
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||
parse_pdf(pdf_buf, doc.size, &doc);
|
||||
|
||||
if (pdf_buf != buf) {
|
||||
free(pdf_buf);
|
||||
}
|
||||
|
||||
} else if (mmime == MimeText && ScanCtx.content_size > 0) {
|
||||
parse_text(bytes_read, &fd, (char *) buf, &doc);
|
||||
|
||||
} else if (IS_FONT(doc.mime)) {
|
||||
void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||
parse_font(font_buf, doc.size, &doc);
|
||||
|
||||
if (font_buf != buf) {
|
||||
free(font_buf);
|
||||
}
|
||||
}
|
||||
|
||||
write_document(&doc);
|
||||
|
||||
if (fd != -1) {
|
||||
close(fd);
|
||||
}
|
||||
|
||||
free(job);
|
||||
}
|
||||
10
src/parsing/parse.h
Normal file
10
src/parsing/parse.h
Normal file
@@ -0,0 +1,10 @@
|
||||
#ifndef SIST2_PARSE_H
|
||||
#define SIST2_PARSE_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
#define PARSE_BUF_SIZE 4096
|
||||
|
||||
void parse(void *arg);
|
||||
|
||||
#endif
|
||||
114
src/parsing/pdf.c
Normal file
114
src/parsing/pdf.c
Normal file
@@ -0,0 +1,114 @@
|
||||
#include "pdf.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
/**
 * Render page 0 of `fzdoc` as a PNG whose longest side is ScanCtx.tn_size
 * and store it under the document's uuid.
 *
 * @param ctx   MuPDF context
 * @param doc   document whose uuid keys the thumbnail in the store
 * @param fzdoc opened MuPDF document
 * @return the loaded page 0; it is not dropped here
 */
fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {

    fz_page *cover = fz_load_page(ctx, fzdoc, 0);
    fz_rect bounds = fz_bound_page(ctx, cover);

    // Scale so that the longer side of the page matches the thumbnail size
    float w = (float) bounds.x1 - bounds.x0;
    float h = (float) bounds.y1 - bounds.y0;
    float longest_side = (w > h) ? w : h;
    float scale = (float) ScanCtx.tn_size / longest_side;

    fz_matrix m = fz_scale(scale, scale);
    fz_colorspace *color_space = fz_device_rgb(ctx);
    fz_pixmap *pixmap = fz_new_pixmap_from_page(ctx, cover, m, color_space, 0);

    fz_buffer *fzbuf = fz_new_buffer_from_pixmap_as_png(ctx, pixmap, fz_default_color_params);

    unsigned char *tn_buf;
    size_t tn_len = fz_buffer_storage(ctx, fzbuf, &tn_buf);

    store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len);

    fz_drop_pixmap(ctx, pixmap);
    fz_drop_buffer(ctx, fzbuf);

    return cover;
}
|
||||
|
||||
|
||||
/* Message callback that discards its arguments; parse_pdf() installs it as
 * the context's warning and error printer. */
void fz_noop_callback(void *user, const char *message) {
    (void) user;
    (void) message;
}
|
||||
|
||||
/**
 * Parse a document held in memory with MuPDF: store a thumbnail of its first
 * page and attach up to ScanCtx.content_size of extracted text to `doc` as
 * MetaContent.
 *
 * MuPDF errors are swallowed; in that case no content metadata is added.
 *
 * @param buf     document bytes
 * @param buf_len number of bytes in `buf`
 * @param doc     document that receives the metadata; its mime selects the
 *                MuPDF handler
 */
void parse_pdf(void *buf, size_t buf_len, document_t *doc) {

    fz_context *ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
    if (ctx == NULL) {
        return;
    }

    // Assigned inside fz_try and released in fz_always, hence fz_var
    fz_stream *stream = NULL;
    fz_document *fzdoc = NULL;
    fz_var(stream);
    fz_var(fzdoc);

    fz_try(ctx)
    {
        fz_register_document_handlers(ctx);

        ctx->warn.print = fz_noop_callback; //disable warnings
        ctx->error.print = fz_noop_callback;

        stream = fz_open_memory(ctx, buf, buf_len);
        fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);

        int page_count = fz_count_pages(ctx, fzdoc);
        fz_page *cover = render_cover(ctx, doc, fzdoc);

        fz_stext_options opts;
        fz_parse_stext_options(ctx, &opts, "preserve-ligatures");

        text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size);

        for (int current_page = 0; current_page < page_count; current_page++) {
            // Page 0 was already loaded for the thumbnail
            fz_page *page;
            if (current_page == 0) {
                page = cover;
            } else {
                page = fz_load_page(ctx, fzdoc, current_page);
            }
            fz_stext_page *stext = fz_new_stext_page_from_page(ctx, page, &opts);

            // A for loop guarantees that `block` advances even when a
            // non-text block is skipped
            for (fz_stext_block *block = stext->first_block; block != NULL; block = block->next) {

                if (block->type != FZ_STEXT_BLOCK_TEXT) {
                    continue;
                }

                for (fz_stext_line *line = block->u.t.first_line; line != NULL; line = line->next) {
                    for (fz_stext_char *c = line->first_char; c != NULL; c = c->next) {
                        if (text_buffer_append_char(&text_buf, c->c) == TEXT_BUF_FULL) {
                            fz_drop_page(ctx, page);
                            fz_drop_stext_page(ctx, stext);
                            goto write_loop_end;
                        }
                    }
                }
            }
            fz_drop_page(ctx, page);
            fz_drop_stext_page(ctx, stext);
        }
        write_loop_end:;
        text_buffer_terminate_string(&text_buf);

        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur);
        if (meta_content != NULL) {
            meta_content->key = MetaContent;
            memcpy(meta_content->strval, text_buf.dyn_buffer.buf, text_buf.dyn_buffer.cur);
        }
        text_buffer_destroy(&text_buf);
        if (meta_content != NULL) {
            APPEND_META(doc, meta_content)
        }
    }
    fz_always(ctx)
    {
        // Runs on success and on error; both drop functions accept NULL
        fz_drop_stream(ctx, stream);
        fz_drop_document(ctx, fzdoc);
    }
    fz_catch(ctx)
    {
        // Errors are ignored on purpose: the document is indexed without content
    }

    // The context must outlive fz_catch, which still reads its error stack
    fz_drop_context(ctx);
}
|
||||
|
||||
9
src/parsing/pdf.h
Normal file
9
src/parsing/pdf.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef SIST2_PDF_H
|
||||
#define SIST2_PDF_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
|
||||
void parse_pdf(void *buf, size_t buf_len, document_t *doc);
|
||||
|
||||
#endif
|
||||
43
src/parsing/text.c
Normal file
43
src/parsing/text.c
Normal file
@@ -0,0 +1,43 @@
|
||||
#include "text.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
/**
 * Attach up to ScanCtx.content_size bytes of a text file to `doc` as
 * MetaContent.
 *
 * @param bytes_read number of bytes of the file already present in `buf`
 * @param fd         in/out file descriptor; opened here when it is -1 and more
 *                   data is needed, and left open for the caller to close
 * @param buf        bytes already read from the start of the file
 * @param doc        document that receives the metadata; its filepath and size
 *                   are used to read the rest of the file
 */
void parse_text(int bytes_read, int *fd, char *buf, document_t *doc) {

    char *intermediate_buf;
    int intermediate_buf_len;

    if (bytes_read < 0) {
        bytes_read = 0;
    }

    if (bytes_read == doc->size || bytes_read >= ScanCtx.content_size) {
        // Everything that is needed is already in `buf`
        int to_copy = MIN(bytes_read, ScanCtx.content_size);
        intermediate_buf = malloc(to_copy > 0 ? to_copy : 1);
        if (intermediate_buf == NULL) {
            return;
        }
        intermediate_buf_len = to_copy;
        memcpy(intermediate_buf, buf, to_copy);

    } else {
        if (*fd == -1) {
            *fd = open(doc->filepath, O_RDONLY);
            if (*fd == -1) {
                perror("open");
                return;
            }
        }

        int to_read = MIN(ScanCtx.content_size, doc->size) - bytes_read;
        if (to_read < 0) {
            to_read = 0;
        }

        intermediate_buf = malloc(to_read + bytes_read > 0 ? to_read + bytes_read : 1);
        if (intermediate_buf == NULL) {
            return;
        }
        if (bytes_read != 0) {
            memcpy(intermediate_buf, buf, bytes_read);
        }

        // Only count bytes that were really read, so that a short read or an
        // error never exposes uninitialised memory
        int total = bytes_read;
        while (total < bytes_read + to_read) {
            ssize_t ret = read(*fd, intermediate_buf + total, bytes_read + to_read - total);
            if (ret == -1) {
                perror("read");
                break;
            }
            if (ret == 0) {
                break;
            }
            total += (int) ret;
        }
        intermediate_buf_len = total;
    }

    text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size);
    for (int i = 0; i < intermediate_buf_len; i++) {
        text_buffer_append_char(&text_buf, *(intermediate_buf + i));
    }
    text_buffer_terminate_string(&text_buf);

    meta_line_t *meta = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur);
    if (meta != NULL) {
        meta->key = MetaContent;
        strcpy(meta->strval, text_buf.dyn_buffer.buf);
    }
    text_buffer_destroy(&text_buf);
    free(intermediate_buf);
    if (meta != NULL) {
        APPEND_META(doc, meta)
    }
}
|
||||
8
src/parsing/text.h
Normal file
8
src/parsing/text.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef SIST2_TEXT_H
|
||||
#define SIST2_TEXT_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
void parse_text(int bytes_read, int *fd, char *buf, document_t *doc);
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user