text component

This commit is contained in:
simon987 2020-03-25 09:44:34 -04:00
commit a24d4dc538
11 changed files with 511 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
.idea/
cmake_install.cmake
Makefile
libscan.a
libscan.so
*.cbp
CMakeFiles
CMakeCache.txt

6
.gitmodules vendored Normal file
View File

@ -0,0 +1,6 @@
[submodule "third-party/uuid"]
path = third-party/uuid
url = https://github.com/certik/uuid
[submodule "third-party/utf8.h"]
path = third-party/utf8.h
url = https://github.com/sheredom/utf8.h

33
CMakeLists.txt Normal file
View File

@ -0,0 +1,33 @@
# Build definition for the "scan" library target.
cmake_minimum_required(VERSION 3.15)
project(scan C)
set(CMAKE_C_STANDARD 11)
# No STATIC/SHARED keyword is given, so the library type follows BUILD_SHARED_LIBS.
# NOTE(review): "third-party/utf8.h" is the submodule directory declared in .gitmodules,
# while src/util.h includes "third-party/utf8.h/utf8.h" - confirm that listing the
# directory itself as a source is intended.
add_library(
scan
src/text.c src/text.h
src/util.c src/util.h
src/types.h
src/macros.h
third-party/utf8.h
)
# Third party
add_subdirectory(third-party/uuid/)
# Makes "uuid.h" (included by src/types.h) resolvable from the uuid submodule sources.
target_include_directories(
scan PRIVATE
third-party/uuid/src/
)
# Every compiler warning is promoted to an error for this target.
target_compile_options(
scan PRIVATE
-Werror
)
target_link_libraries(
scan
uuid
)

13
src/macros.h Normal file
View File

@ -0,0 +1,13 @@
/* Boolean constants; an existing definition from the including code wins. */
#if !defined(FALSE)
#define FALSE (0)
#endif

#if !defined(TRUE)
#define TRUE (!FALSE)
#endif

/*
 * MAX / MIN always replace any earlier definition of the same name.
 * Both are plain macros: the selected argument is evaluated twice, so
 * arguments with side effects must be avoided.
 */
#undef MAX
#define MAX(a, b) (((b) < (a)) ? (a) : (b))

#undef MIN
#define MIN(a, b) (((b) > (a)) ? (a) : (b))

29
src/text.c Normal file
View File

@ -0,0 +1,29 @@
#include "text.h"
/**
 * Reads the beginning of a file and attaches its cleaned-up text to the
 * document as a MetaContent metadata line.
 *
 * At most MIN(ctx->content_size, doc->size) bytes are requested from the
 * file. The bytes are passed through a text buffer, NUL-terminated and copied
 * into a newly allocated meta_line_t that is appended to doc's metadata list
 * (the list then references that allocation).
 *
 * @param ctx parser settings; content_size bounds both the read and the
 *            text buffer
 * @param f   virtual file to read from
 * @param doc document receiving the metadata line
 * @return SCAN_OK on success; SCAN_ERR_READ when the read callback reports an
 *         error or when memory cannot be allocated (no other error code
 *         exists yet)
 */
scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc) {
    int to_read = MIN(ctx->content_size, doc->size);

    char *buf = malloc(to_read);
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // counts as an allocation failure.
    if (buf == NULL && to_read != 0) {
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        //TODO: log
        free(buf);
        return SCAN_ERR_READ;
    }

    // Only the bytes actually delivered are initialized; never parse past them.
    // NOTE(review): assumes read() returns the number of bytes stored in buf -
    // confirm against the vfile implementations.
    size_t available = MIN(ret, to_read);

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    text_buffer_append_string(&tex, buf, available);
    text_buffer_terminate_string(&tex);
    free(buf);

    scan_code_t status = SCAN_ERR_READ;

    // dyn_buffer.cur already counts the terminating NUL, so the copy below fits.
    meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
    if (meta != NULL) {
        meta->key = MetaContent;
        strcpy(meta->str_val, tex.dyn_buffer.buf);
        APPEND_META(doc, meta)
        status = SCAN_OK;
    }

    text_buffer_destroy(&tex);
    return status;
}

13
src/text.h Normal file
View File

@ -0,0 +1,13 @@
#ifndef SCAN_TEXT_H
#define SCAN_TEXT_H
#include "types.h"
#include "util.h"
// Settings for the plain-text parser.
typedef struct {
// Upper bound on the number of bytes parse_text requests from the file; it is also
// handed to text_buffer_create() as the text buffer's max_size.
long content_size;
} scan_text_ctx_t;
// Reads text from `f` and appends it to `doc` as a MetaContent metadata line.
// Returns SCAN_OK on success, or SCAN_ERR_READ when the read callback returns a negative value.
scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc);
#endif

124
src/types.h Normal file
View File

@ -0,0 +1,124 @@
#ifndef SCAN_TYPES_H
#define SCAN_TYPES_H
#include <stdio.h>
#include <sys/stat.h>
#include "uuid.h"
// TODO
// Archive handling modes; presumably the values stored in an archive_mode_t.
// NOTE(review): what each mode does is not visible in this file - confirm against the archive parser.
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
/*
 * Metadata keys encode the type of their value in a mask bit that is ORed
 * with a small numeric id.
 */
#define META_INT_MASK 0x80
#define META_STR_MASK 0x40
#define META_LONG_MASK 0x20

/*
 * Build a key from an id. Fully parenthesized so the result can be used
 * inside larger expressions.
 */
#define META_STR(id) (((unsigned) (id)) | ((unsigned) META_STR_MASK))
#define META_INT(id) (((unsigned) (id)) | ((unsigned) META_INT_MASK))
#define META_LONG(id) (((unsigned) (id)) | ((unsigned) META_LONG_MASK))

/*
 * Type tests. IS_META_INT / IS_META_LONG take a key value, whereas
 * IS_META_STR takes a pointer to an object that has a `key` member.
 */
#define IS_META_INT(key) (((key) & META_INT_MASK) == META_INT_MASK)
#define IS_META_LONG(key) (((key) & META_LONG_MASK) == META_LONG_MASK)
#define IS_META_STR(meta) (((meta)->key & META_STR_MASK) == META_STR_MASK)
/* Result code returned by the parsers. */
typedef int scan_code_t;
/*
 * The cast expressions are wrapped in parentheses so the constants behave as
 * single operands (e.g. under sizeof or next to other operators).
 */
#define SCAN_OK ((scan_code_t) 0)
#define SCAN_ERR_READ ((scan_code_t) -1)
// Identifiers for the metadata entries that can be attached to a document.
// Each value is a numeric id (1-23) combined by META_STR / META_INT / META_LONG
// with the mask bit describing the type of the entry's value.
// This is written to file as a 16-bit int!
enum metakey {
MetaContent = META_STR(1),
MetaWidth = META_INT(2),
MetaHeight = META_INT(3),
MetaMediaDuration = META_LONG(4),
MetaMediaAudioCodec = META_INT(5),
MetaMediaVideoCodec = META_INT(6),
MetaMediaBitrate = META_LONG(7),
MetaArtist = META_STR(8),
MetaAlbum = META_STR(9),
MetaAlbumArtist = META_STR(10),
MetaGenre = META_STR(11),
MetaTitle = META_STR(12),
MetaFontName = META_STR(13),
MetaParent = META_STR(14),
MetaExifMake = META_STR(15),
MetaExifSoftware = META_STR(16),
MetaExifExposureTime = META_STR(17),
MetaExifFNumber = META_STR(18),
MetaExifFocalLength = META_STR(19),
MetaExifUserComment = META_STR(20),
MetaExifModel = META_STR(21),
MetaExifIsoSpeedRatings = META_STR(22),
MetaExifDateTime = META_STR(23),
};
// One metadata entry of a document; entries are chained through `next`.
typedef struct meta_line {
// Following entry in the list; APPEND_META sets it to NULL on the entry it appends.
struct meta_line *next;
// Identifies the entry.
enum metakey key;
// Value storage.
// NOTE(review): presumably the member to use is selected by the type bit of `key` - confirm.
union {
// Zero-length array (compiler extension): string bytes are stored inline starting here,
// so the allocation must be sized for them (parse_text allocates sizeof(meta_line_t)
// plus the string length, terminator included).
char str_val[0];
int int_val;
unsigned long long_val;
};
} meta_line_t;
// Per-document record: identifying data and file attributes plus the list of metadata entries.
// NOTE(review): the meaning of most attribute fields is inferred from their names - confirm with the producers of this struct.
typedef struct document {
unsigned char uuid[16];
unsigned long ino;
// Compared with the configured content size in parse_text to bound how many bytes are read.
unsigned long size;
unsigned int mime;
int mtime;
short base;
short ext;
// First and last entries of the metadata list; both are maintained by APPEND_META
// and meta_head must be NULL while the list is empty.
meta_line_t *meta_head;
meta_line_t *meta_tail;
char *filepath;
} document_t;
// Forward declaration so the callback types below can refer to struct vfile.
typedef struct vfile vfile_t;
// Read callback: receives the vfile, a destination buffer and the buffer size.
// parse_text treats a negative return value as a read error.
// NOTE(review): a non-negative return value is presumably the number of bytes read - confirm.
__attribute__((warn_unused_result))
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
// Close callback for a vfile.
typedef void (*close_func_t)(struct vfile *);
// A file accessed through callbacks, so parsers do not depend on where the bytes come from.
typedef struct vfile {
// Underlying handle: a file descriptor or an archive handle.
// NOTE(review): presumably is_fs_file tells which member is valid - confirm.
union {
int fd;
struct archive *arc;
};
int is_fs_file;
char *filepath;
// Callbacks used to read from and close this file.
read_func_t read;
close_func_t close;
} vfile_t;
// Description of one file to parse.
// NOTE(review): field meanings are inferred from their names; no user of this struct is visible here.
typedef struct parse_job_t {
int base;
int ext;
struct stat info;
struct vfile vfile;
uuid_t parent;
// NOTE(review): one-element trailing array, presumably over-allocated to hold the full path - confirm.
char filepath[1];
} parse_job_t;
/*
 * Appends the entry `meta` to the end of the metadata list of `doc`.
 *
 * doc:  pointer to an object with `meta_head` / `meta_tail` members;
 *       meta_head must be NULL while the list is empty.
 * meta: pointer to the entry to append; its `next` member is reset to NULL.
 *
 * The expansion is one brace-enclosed block, so the macro is a single
 * statement (safe as the body of an unbraced `if`) and may be written with or
 * without a trailing semicolon. Both arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define APPEND_META(doc, meta) \
    { \
        (meta)->next = NULL; \
        if ((doc)->meta_head == NULL) { \
            (doc)->meta_head = (meta); \
            (doc)->meta_tail = (doc)->meta_head; \
        } else { \
            (doc)->meta_tail->next = (meta); \
            (doc)->meta_tail = (meta); \
        } \
    }
#endif

0
src/util.c Normal file
View File

283
src/util.h Normal file
View File

@ -0,0 +1,283 @@
#ifndef SIST2_UTIL_H
#define SIST2_UTIL_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../third-party/utf8.h/utf8.h"
#include "macros.h"
/* Returned by the text_buffer_append_* functions once max_size is reached. */
#define TEXT_BUF_FULL (-1)
/* Initial capacity, in bytes, of a dyn_buffer_t (16 KiB). */
#define INITIAL_BUF_SIZE (1024 * 16)
/*
 * Character filter applied to code points before they are stored.
 * Kept: the range '\'' .. ';' (apostrophe through semicolon, digits included),
 * the range 'A' .. 'z' (letters plus the punctuation between 'Z' and 'a') and
 * everything above 127. Anything else is ignored by callers.
 * The argument is evaluated several times; pass an expression without side
 * effects.
 */
#define SHOULD_IGNORE_CHAR(c) (!(SHOULD_KEEP_CHAR(c)))
#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127))
// Growable byte buffer backed by heap memory.
typedef struct dyn_buffer {
// Heap storage; obtained with malloc in dyn_buffer_create and resized with realloc when growing.
char *buf;
// Number of bytes written so far, i.e. the offset at which the next write lands.
size_t cur;
// Current capacity of buf in bytes.
size_t size;
} dyn_buffer_t;
// Text accumulator built on a dyn_buffer_t; filters characters and collapses runs of ignored characters.
typedef struct text_buffer {
// When greater than zero, text_buffer_append_char returns TEXT_BUF_FULL once dyn_buffer.cur
// reaches this value; values <= 0 disable that check.
long max_size;
// TRUE when the most recently stored character was the single space that replaces
// ignored characters; used to avoid storing several spaces in a row.
int last_char_was_whitespace;
dyn_buffer_t dyn_buffer;
} text_buffer_t;
/**
 * Checks whether `s` starts with one well-formed UTF-8 sequence that stands
 * on its own.
 *
 * A multi-byte sequence is rejected when one of its continuation bytes is
 * missing, when the byte right after the sequence is itself a continuation
 * byte, or when the encoding is overlong. Depending on the lead byte, bytes
 * up to s[4] are inspected, so `s` must be readable that far.
 *
 * @param s bytes to inspect
 * @return TRUE when the sequence is accepted, FALSE otherwise
 */
__always_inline
static int utf8_validchr2(const char *s) {
    const char lead = s[0];

    /* 0xxxxxxx: single-byte sequence. */
    if ((lead & 0x80) == 0x00) {
        return TRUE;
    }

    /* 11110xxx: four-byte sequence. */
    if ((lead & 0xf8) == 0xf0) {
        const int malformed =
                (s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80
                || (s[4] & 0xc0) == 0x80
                || ((lead & 0x07) == 0 && (s[1] & 0x30) == 0);
        return malformed ? FALSE : TRUE;
    }

    /* 1110xxxx: three-byte sequence. */
    if ((lead & 0xf0) == 0xe0) {
        const int malformed =
                (s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80
                || (s[3] & 0xc0) == 0x80
                || ((lead & 0x0f) == 0 && (s[1] & 0x20) == 0);
        return malformed ? FALSE : TRUE;
    }

    /* 110xxxxx: two-byte sequence. */
    if ((lead & 0xe0) == 0xc0) {
        const int malformed =
                (s[1] & 0xc0) != 0x80
                || (s[2] & 0xc0) == 0x80
                || (lead & 0x1e) == 0;
        return malformed ? FALSE : TRUE;
    }

    /* Stray continuation byte or invalid lead byte. */
    return FALSE;
}
/**
 * Creates an empty dynamic buffer with INITIAL_BUF_SIZE bytes of capacity.
 *
 * The buffer helpers have no way to report an error, so an allocation failure
 * terminates the process with a diagnostic instead of handing out a buffer
 * whose storage pointer is NULL.
 *
 * @return the new buffer; release it with dyn_buffer_destroy()
 */
__always_inline
static dyn_buffer_t dyn_buffer_create(void) {
    dyn_buffer_t buf;

    buf.size = INITIAL_BUF_SIZE;
    buf.cur = 0;
    buf.buf = malloc(INITIAL_BUF_SIZE);

    if (buf.buf == NULL) {
        fprintf(stderr, "dyn_buffer_create: out of memory\n");
        abort();
    }

    return buf;
}
/**
 * Makes sure `size` more bytes fit behind the current write position,
 * doubling the capacity as often as needed.
 *
 * The realloc result is checked before it replaces the old pointer. This
 * function cannot report an error and its callers write to the buffer right
 * away, so an allocation failure terminates the process with a diagnostic
 * rather than continuing with a NULL storage pointer.
 *
 * @param buf  buffer to grow; its capacity must be non-zero
 * @param size number of bytes about to be written
 */
__always_inline
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
    if (buf->cur + size <= buf->size) {
        return;
    }

    size_t new_size = buf->size;
    do {
        new_size *= 2;
    } while (buf->cur + size > new_size);

    void *grown = realloc(buf->buf, new_size);
    if (grown == NULL) {
        fprintf(stderr, "grow_buffer: out of memory\n");
        abort();
    }

    buf->buf = grown;
    buf->size = new_size;
}
/**
 * Makes sure at least sizeof(long) bytes fit behind the current write
 * position, doubling the capacity once when they do not.
 *
 * The realloc result is checked before it replaces the old pointer. This
 * function cannot report an error and its callers write to the buffer right
 * away, so an allocation failure terminates the process with a diagnostic
 * rather than continuing with a NULL storage pointer.
 *
 * @param buf buffer to grow
 */
__always_inline
static void grow_buffer_small(dyn_buffer_t *buf) {
    if (buf->cur + sizeof(long) <= buf->size) {
        return;
    }

    size_t new_size = buf->size * 2;
    void *grown = realloc(buf->buf, new_size);
    if (grown == NULL) {
        fprintf(stderr, "grow_buffer_small: out of memory\n");
        abort();
    }

    buf->buf = grown;
    buf->size = new_size;
}
/**
 * Appends `size` raw bytes from `data` to the buffer, growing it first when
 * necessary.
 *
 * @param buf  destination buffer
 * @param data bytes to copy
 * @param size number of bytes to copy
 */
__always_inline
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
    grow_buffer(buf, size);

    /* Take the destination only after growing: the storage may have moved. */
    char *dest = buf->buf + buf->cur;
    memcpy(dest, data, size);

    buf->cur = buf->cur + size;
}
/**
 * Appends a single byte to the buffer.
 *
 * @param buf destination buffer
 * @param c   byte to store
 */
__always_inline
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
    grow_buffer_small(buf);

    buf->buf[buf->cur] = c;
    /* A char occupies exactly one byte. */
    buf->cur += 1;
}
/**
 * Appends a C string to the buffer, including its NUL terminator.
 *
 * The string is only read, so it is taken as `const char *`; string literals
 * and const data can be passed without a cast.
 *
 * @param buf destination buffer
 * @param str NUL-terminated string to copy; must not be NULL
 */
__always_inline
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
    dyn_buffer_write(buf, str, strlen(str));
    dyn_buffer_write_char(buf, '\0');
}
/**
 * Appends the characters of a C string to the buffer without its NUL
 * terminator.
 *
 * The string is only read, so it is taken as `const char *`; string literals
 * and const data can be passed without a cast.
 *
 * @param buf destination buffer
 * @param str NUL-terminated string to copy; must not be NULL
 */
__always_inline
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
    dyn_buffer_write(buf, str, strlen(str));
}
/**
 * Appends the in-memory representation of an int (host byte order) to the
 * buffer.
 *
 * The write position can be any byte offset, so the value is copied with
 * memcpy instead of being stored through a casted pointer, which would be a
 * potentially misaligned access.
 *
 * @param buf destination buffer
 * @param d   value to store
 */
__always_inline
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &d, sizeof(d));
    buf->cur += sizeof(d);
}
/**
 * Appends the in-memory representation of a short (host byte order) to the
 * buffer.
 *
 * The write position can be any byte offset, so the value is copied with
 * memcpy instead of being stored through a casted pointer, which would be a
 * potentially misaligned access.
 *
 * @param buf destination buffer
 * @param s   value to store
 */
__always_inline
static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &s, sizeof(s));
    buf->cur += sizeof(s);
}
/**
 * Appends the in-memory representation of an unsigned long (host byte order)
 * to the buffer.
 *
 * The write position can be any byte offset, so the value is copied with
 * memcpy instead of being stored through a casted pointer, which would be a
 * potentially misaligned access.
 *
 * @param buf destination buffer
 * @param l   value to store
 */
__always_inline
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &l, sizeof(l));
    buf->cur += sizeof(l);
}
/**
 * Releases the storage owned by the buffer.
 *
 * The storage pointer is cleared afterwards, so destroying the same buffer a
 * second time is a harmless free(NULL) instead of a double free. The buffer
 * must not be written to again after this call.
 *
 * @param buf buffer whose storage is released
 */
__always_inline
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
    free(buf->buf);
    buf->buf = NULL;
}
/**
 * Releases the storage held by a text buffer.
 *
 * @param buf text buffer to tear down
 */
__always_inline
static void text_buffer_destroy(text_buffer_t *buf) {
    dyn_buffer_t *storage = &buf->dyn_buffer;
    dyn_buffer_destroy(storage);
}
__always_inline
static text_buffer_t text_buffer_create(long max_size) {
text_buffer_t text_buf;
text_buf.dyn_buffer = dyn_buffer_create();
text_buf.max_size = max_size;
text_buf.last_char_was_whitespace = FALSE;
return text_buf;
}
/**
 * Appends one code point to the text buffer.
 *
 * A space or an ignored character (see SHOULD_IGNORE_CHAR) is stored as a
 * single space, and only when the buffer is not empty and the previous stored
 * character was not already such a space. Every other code point is stored
 * UTF-8 encoded in one to four bytes.
 *
 * @param buf destination text buffer
 * @param c   code point to append
 * @return TEXT_BUF_FULL when max_size is positive and has been reached after
 *         storing, 0 otherwise
 */
__always_inline
static int text_buffer_append_char(text_buffer_t *buf, int c) {
    dyn_buffer_t *out = &buf->dyn_buffer;

    if (c == ' ' || SHOULD_IGNORE_CHAR(c)) {
        /* Never start the text with a space and never store two in a row. */
        if (buf->last_char_was_whitespace || out->cur == 0) {
            return 0;
        }

        dyn_buffer_write_char(out, ' ');
        buf->last_char_was_whitespace = TRUE;

        if (buf->max_size > 0 && out->cur >= buf->max_size) {
            return TEXT_BUF_FULL;
        }
        return 0;
    }

    buf->last_char_was_whitespace = FALSE;
    grow_buffer_small(out);

    /* Take the destination only after growing: the storage may have moved. */
    char *dst = out->buf + out->cur;
    size_t written;

    if (((utf8_int32_t) 0xffffff80 & c) == 0) {
        /* One byte */
        dst[0] = (char) c;
        written = 1;
    } else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
        /* Two bytes */
        dst[0] = 0xc0 | (char) (c >> 6);
        dst[1] = 0x80 | (char) (c & 0x3f);
        written = 2;
    } else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
        /* Three bytes */
        dst[0] = 0xe0 | (char) (c >> 12);
        dst[1] = 0x80 | (char) ((c >> 6) & 0x3f);
        dst[2] = 0x80 | (char) (c & 0x3f);
        written = 3;
    } else {
        /* Four bytes */
        dst[0] = 0xf0 | (char) (c >> 18);
        dst[1] = 0x80 | (char) ((c >> 12) & 0x3f);
        dst[2] = 0x80 | (char) ((c >> 6) & 0x3f);
        dst[3] = 0x80 | (char) (c & 0x3f);
        written = 4;
    }
    out->cur += written;

    if (buf->max_size > 0 && out->cur >= buf->max_size) {
        return TEXT_BUF_FULL;
    }
    return 0;
}
/**
 * NUL-terminates the accumulated text.
 *
 * A trailing space is overwritten by the terminator; otherwise the terminator
 * is appended as an additional byte.
 *
 * @param buf text buffer to terminate
 */
__always_inline
static void text_buffer_terminate_string(text_buffer_t *buf) {
    dyn_buffer_t *out = &buf->dyn_buffer;
    const int ends_with_space = out->cur > 0 && out->buf[out->cur - 1] == ' ';

    if (!ends_with_space) {
        dyn_buffer_write_char(out, '\0');
        return;
    }

    out->buf[out->cur - 1] = '\0';
}
// True when scanning must stop: `ptr` is at least `len` bytes past `str`, points at a NUL byte,
// or points at the lead byte of a 2-, 3- or 4-byte sequence that would extend past `len`.
// Relies on variables named `ptr`, `str` and `len` being in scope where the macro is expanded.
#define UTF8_END_OF_STRING \
(ptr - str >= len || *ptr == 0 || \
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
(0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
(0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
/**
 * Appends up to `len` bytes of UTF-8 text to the text buffer.
 *
 * Inputs of at most 4 bytes are handled separately: their ASCII bytes are
 * copied as they are and every other byte is dropped. Longer inputs are
 * decoded code point by code point; sequences rejected by utf8_validchr2()
 * are skipped and the rest goes through text_buffer_append_char().
 *
 * The locals `ptr`, `str` and `len` are referenced by name from the
 * UTF8_END_OF_STRING macro and must keep these names.
 *
 * @param buf destination text buffer
 * @param str text to append; may be NULL, in which case nothing is appended
 * @param len number of readable bytes in `str`
 * @return 0 on success, or the non-zero value returned by
 *         text_buffer_append_char() (TEXT_BUF_FULL) when the buffer is full
 */
__always_inline
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
    const char *ptr = str;
    const char *oldPtr = ptr;

    if (str == NULL || UTF8_END_OF_STRING) {
        return 0;
    }

    if (len <= 4) {
        for (size_t i = 0; i < len; i++) {
            if (((utf8_int32_t) 0xffffff80 & str[i]) == 0) {
                dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
            }
        }
        return 0;
    }

    utf8_int32_t c;
    char tmp[16];

    do {
        ptr = utf8codepoint(ptr, &c);

        // utf8_validchr2() looks one byte past the sequence (up to tmp[4]), so
        // the whole scratch area is cleared before the sequence is copied in.
        memset(tmp, 0, sizeof(tmp));
        memcpy(tmp, oldPtr, ptr - oldPtr);
        oldPtr = ptr;

        if (!utf8_validchr2(tmp)) {
            continue;
        }

        int ret = text_buffer_append_char(buf, c);
        if (ret != 0) {
            return ret;
        }
    } while (!UTF8_END_OF_STRING);

    return 0;
}
/**
 * Appends a NUL-terminated UTF-8 string to the text buffer.
 *
 * The string is only read, so it is taken as `const char *`, matching
 * text_buffer_append_string().
 *
 * @param buf destination text buffer
 * @param str NUL-terminated string; must not be NULL because its length is
 *            measured with strlen()
 * @return the result of text_buffer_append_string()
 */
__always_inline
static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
    return text_buffer_append_string(buf, str, strlen(str));
}
#endif

1
third-party/utf8.h vendored Submodule

@ -0,0 +1 @@
Subproject commit b686b0c5181c2dd9f8297e6ac3692c9614b083be

1
third-party/uuid vendored Submodule

@ -0,0 +1 @@
Subproject commit f895102e2ddaf86387a62c3544abb78c0a5cfbae