From a24d4dc53859d0dd4ad9dbc2f5ef9b2f14f84c6e Mon Sep 17 00:00:00 2001
From: simon987
Date: Wed, 25 Mar 2020 09:44:34 -0400
Subject: [PATCH] text component

---
 .gitignore         |   8 ++
 .gitmodules        |   6 +
 CMakeLists.txt     |  33 ++++++
 src/macros.h       |  13 +++
 src/text.c         |  29 +++++
 src/text.h         |  13 +++
 src/types.h        | 124 ++++++++++++++++++++
 src/util.c         |   0
 src/util.h         | 283 +++++++++++++++++++++++++++++++++++++++++++++
 third-party/utf8.h |   1 +
 third-party/uuid   |   1 +
 11 files changed, 511 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 CMakeLists.txt
 create mode 100644 src/macros.h
 create mode 100644 src/text.c
 create mode 100644 src/text.h
 create mode 100644 src/types.h
 create mode 100644 src/util.c
 create mode 100644 src/util.h
 create mode 160000 third-party/utf8.h
 create mode 160000 third-party/uuid

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a7a837c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.idea/
+cmake_install.cmake
+Makefile
+libscan.a
+libscan.so
+*.cbp
+CMakeFiles
+CMakeCache.txt
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..bad7d45
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "third-party/uuid"]
+	path = third-party/uuid
+	url = https://github.com/certik/uuid
+[submodule "third-party/utf8.h"]
+	path = third-party/utf8.h
+	url = https://github.com/sheredom/utf8.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..ac2dc6e
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.15)
+project(scan C)
+
+set(CMAKE_C_STANDARD 11)
+
+
+add_library(
+        scan
+        src/text.c src/text.h
+        src/util.c src/util.h
+        src/types.h
+        src/macros.h
+
+        third-party/utf8.h
+)
+
+
+# Third party
+add_subdirectory(third-party/uuid/)
+target_include_directories(
+        scan PRIVATE
+        third-party/uuid/src/
+)
+
+target_compile_options(
+        scan PRIVATE
+        -Werror
+)
+
+target_link_libraries(
+        scan
+        uuid
+)
\ No newline at end of file
diff --git a/src/macros.h b/src/macros.h
new file mode 100644
index 0000000..5033f70
--- /dev/null
+++ b/src/macros.h
@@ -0,0 +1,13 @@
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+#ifndef TRUE
+#define TRUE (!FALSE)
+#endif
+
+#undef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+#undef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
diff --git a/src/text.c b/src/text.c
new file mode 100644
index 0000000..7e43519
--- /dev/null
+++ b/src/text.c
@@ -0,0 +1,29 @@
+#include <limits.h>
+
+#include "text.h"
+
+/**
+ * Reads the beginning of a file through f->read, filters it through a
+ * text_buffer_t and appends the result to doc as a MetaContent entry.
+ *
+ * At most ctx->content_size bytes are requested; a negative content_size
+ * requests doc->size bytes. content_size is also passed unchanged to
+ * text_buffer_create() as the size limit of the filtered text.
+ *
+ * @param ctx parser settings, only content_size is read
+ * @param f   source file, only f->read is called
+ * @param doc document whose size is read and whose meta list is appended to
+ * @return SCAN_OK on success, SCAN_ERR_READ when f->read returns a negative
+ *         value or an allocation fails (no dedicated out-of-memory code exists)
+ */
+scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc) {
+
+    // Compare in the unsigned domain only once content_size is known to be
+    // non-negative, then clamp to what the int result of f->read can report
+    size_t to_read = doc->size;
+    if (ctx->content_size >= 0 && (unsigned long) ctx->content_size < doc->size) {
+        to_read = (size_t) ctx->content_size;
+    }
+    if (to_read > (size_t) INT_MAX) {
+        to_read = (size_t) INT_MAX;
+    }
+
+    // Zero-filled so that bytes a short read leaves untouched are NUL (which
+    // ends the text) rather than uninitialised memory; one byte minimum
+    // because a zero-sized request may legally return NULL
+    char *buf = calloc(to_read > 0 ? to_read : 1, 1);
+    if (buf == NULL) {
+        return SCAN_ERR_READ;
+    }
+
+    int ret = f->read(f, buf, to_read);
+    if (ret < 0) {
+        //TODO: log
+        free(buf);
+        return SCAN_ERR_READ;
+    }
+
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+    text_buffer_append_string(&tex, buf, to_read);
+    text_buffer_terminate_string(&tex);
+    free(buf);
+
+    // tex.dyn_buffer.cur counts the terminating NUL, so the string fits
+    meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
+    if (meta == NULL) {
+        text_buffer_destroy(&tex);
+        return SCAN_ERR_READ;
+    }
+    meta->key = MetaContent;
+    strcpy(meta->str_val, tex.dyn_buffer.buf);
+
+    APPEND_META(doc, meta);
+
+    text_buffer_destroy(&tex);
+
+    return SCAN_OK;
+}
+
diff --git a/src/text.h b/src/text.h
new file mode 100644
index 0000000..bae8f51
--- /dev/null
+++ b/src/text.h
@@ -0,0 +1,13 @@
+#ifndef SCAN_TEXT_H
+#define SCAN_TEXT_H
+
+#include "types.h"
+#include "util.h"
+
+typedef struct {
+    long content_size;
+} scan_text_ctx_t;
+
+scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc);
+
+#endif
diff --git a/src/types.h b/src/types.h
new file mode 100644
index 0000000..7bf4913
--- /dev/null
+++ b/src/types.h
@@ -0,0 +1,124 @@
+#ifndef SCAN_TYPES_H
+#define SCAN_TYPES_H
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include "uuid.h"
+
+
+// TODO
+#define ARC_MODE_SKIP 0
+#define ARC_MODE_LIST 1
+#define ARC_MODE_SHALLOW 2
+#define ARC_MODE_RECURSE 3
+typedef int archive_mode_t;
+
+#define META_INT_MASK 0x80
+#define META_STR_MASK 0x40
+#define META_LONG_MASK 0x20
+
+// Build a metakey value: the id OR-ed with the mask of its value type.
+// Fully parenthesized so the expansion is safe inside larger expressions.
+#define META_STR(id) (((unsigned) (id)) | ((unsigned) META_STR_MASK))
+#define META_INT(id) (((unsigned) (id)) | ((unsigned) META_INT_MASK))
+#define META_LONG(id) (((unsigned) (id)) | ((unsigned) META_LONG_MASK))
+
+// Type tests. IS_META_INT/IS_META_LONG take a key, IS_META_STR takes a pointer to a meta line.
+#define IS_META_INT(key) (((key) & META_INT_MASK) == META_INT_MASK)
+#define IS_META_LONG(key) (((key) & META_LONG_MASK) == META_LONG_MASK)
+#define IS_META_STR(meta) (((meta)->key & META_STR_MASK) == META_STR_MASK)
+
+
+typedef int scan_code_t;
+#define SCAN_OK ((scan_code_t) 0)
+#define SCAN_ERR_READ ((scan_code_t) -1)
+
+// This is written to file as a 16-bit int!
+enum metakey {
+    MetaContent = META_STR(1),
+    MetaWidth = META_INT(2),
+    MetaHeight = META_INT(3),
+    MetaMediaDuration = META_LONG(4),
+    MetaMediaAudioCodec = META_INT(5),
+    MetaMediaVideoCodec = META_INT(6),
+    MetaMediaBitrate = META_LONG(7),
+    MetaArtist = META_STR(8),
+    MetaAlbum = META_STR(9),
+    MetaAlbumArtist = META_STR(10),
+    MetaGenre = META_STR(11),
+    MetaTitle = META_STR(12),
+    MetaFontName = META_STR(13),
+    MetaParent = META_STR(14),
+    MetaExifMake = META_STR(15),
+    MetaExifSoftware = META_STR(16),
+    MetaExifExposureTime = META_STR(17),
+    MetaExifFNumber = META_STR(18),
+    MetaExifFocalLength = META_STR(19),
+    MetaExifUserComment = META_STR(20),
+    MetaExifModel = META_STR(21),
+    MetaExifIsoSpeedRatings = META_STR(22),
+    MetaExifDateTime = META_STR(23),
+};
+
+// One entry of a document's singly linked metadata list. For string keys the
+// text is stored inline starting at str_val, so the node is allocated with
+// extra room after the struct.
+typedef struct meta_line {
+    struct meta_line *next;
+    enum metakey key;
+    union {
+        char str_val[0];
+        int int_val;
+        unsigned long long_val;
+    };
+} meta_line_t;
+
+
+typedef struct document {
+    unsigned char uuid[16];
+    unsigned long ino;
+    unsigned long size;
+    unsigned int mime;
+    int mtime;
+    short base;
+    short ext;
+    meta_line_t *meta_head;
+    meta_line_t *meta_tail;
+    char *filepath;
+} document_t;
+
+typedef struct vfile vfile_t;
+
+__attribute__((warn_unused_result))
+typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
+
+typedef void (*close_func_t)(struct vfile *);
+
+typedef struct vfile {
+    union {
+        int fd;
+        struct archive *arc;
+    };
+
+    int is_fs_file;
+    char *filepath;
+
+    read_func_t read;
+    close_func_t close;
+} vfile_t;
+
+typedef struct parse_job_t {
+    int base;
+    int ext;
+    struct stat info;
+    struct vfile vfile;
+    uuid_t parent;
+    char filepath[1];
+} parse_job_t;
+
+
+// Appends meta at the tail of doc's metadata list. Expands to several
+// statements (not a single expression), so it must not be used as the
+// unbraced body of an if/else or a loop.
+#define APPEND_META(doc, meta) \
+    (meta)->next = NULL;\
+    if ((doc)->meta_head == NULL) {\
+        (doc)->meta_head = (meta);\
+        (doc)->meta_tail = (doc)->meta_head;\
+    } else {\
+        (doc)->meta_tail->next = (meta);\
+        (doc)->meta_tail = (meta);\
+    }
+
+#endif
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..e69de29
diff --git a/src/util.h b/src/util.h
new file mode 100644
index 0000000..20c3426
--- /dev/null
+++ b/src/util.h
@@ -0,0 +1,283 @@
+#ifndef SIST2_UTIL_H
+#define SIST2_UTIL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../third-party/utf8.h/utf8.h"
+#include "macros.h"
+
+#define TEXT_BUF_FULL (-1)
+#define INITIAL_BUF_SIZE (1024 * 16)
+
+// Characters from '\'' to ';', from 'A' to 'z' and everything above 127 are kept
+#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
+#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127))
+
+
+// Growable byte buffer: cur is the write offset, size the allocated capacity
+typedef struct dyn_buffer {
+    char *buf;
+    size_t cur;
+    size_t size;
+} dyn_buffer_t;
+
+typedef struct text_buffer {
+    long max_size;
+    int last_char_was_whitespace;
+    dyn_buffer_t dyn_buffer;
+} text_buffer_t;
+
+/**
+ * Checks the UTF-8 sequence that starts at s.
+ *
+ * Returns TRUE for an ASCII byte, or for a 2, 3 or 4 byte sequence whose
+ * continuation bytes are well formed, that is not an overlong encoding and
+ * that is not directly followed by another continuation byte. Returns FALSE
+ * otherwise. The byte after the sequence is read, so up to s[4] must be
+ * readable.
+ */
+__always_inline
+static int utf8_validchr2(const char *s) {
+    if (0x00 == (0x80 & *s)) {
+        return TRUE;
+    } else if (0xf0 == (0xf8 & *s)) {
+        if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
+            (0x80 != (0xc0 & s[3]))) {
+            return FALSE;
+        }
+
+        if (0x80 == (0xc0 & s[4])) {
+            return FALSE;
+        }
+
+        if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
+            return FALSE;
+        }
+    } else if (0xe0 == (0xf0 & *s)) {
+        if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
+            return FALSE;
+        }
+
+        if (0x80 == (0xc0 & s[3])) {
+            return FALSE;
+        }
+
+        if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
+            return FALSE;
+        }
+    } else if (0xc0 == (0xe0 & *s)) {
+        if (0x80 != (0xc0 & s[1])) {
+            return FALSE;
+        }
+
+        if (0x80 == (0xc0 & s[2])) {
+            return FALSE;
+        }
+
+        if (0 == (0x1e & s[0])) {
+            return FALSE;
+        }
+    } else {
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+
+/**
+ * Returns an empty buffer with INITIAL_BUF_SIZE bytes of capacity.
+ * The result of malloc is not checked.
+ */
+__always_inline
+static dyn_buffer_t dyn_buffer_create(void) {
+    dyn_buffer_t buf;
+
+    buf.size = INITIAL_BUF_SIZE;
+    buf.cur = 0;
+    buf.buf = malloc(INITIAL_BUF_SIZE);
+
+    return buf;
+}
+
+/**
+ * Doubles the capacity until size more bytes fit after the write offset.
+ * NOTE(review): a failed realloc is not detected here, the old pointer is
+ * overwritten with NULL.
+ */
+__always_inline
+static void grow_buffer(dyn_buffer_t *buf, size_t size) {
+    if (buf->cur + size > buf->size) {
+        do {
+            buf->size *= 2;
+        } while (buf->cur + size > buf->size);
+
+        buf->buf = realloc(buf->buf, buf->size);
+    }
+}
+
+/**
+ * Doubles the capacity once when fewer than sizeof(long) bytes are free.
+ * NOTE(review): same unchecked realloc as grow_buffer.
+ */
+__always_inline
+static void grow_buffer_small(dyn_buffer_t *buf) {
+    if (buf->cur + sizeof(long) > buf->size) {
+        buf->size *= 2;
+        buf->buf = realloc(buf->buf, buf->size);
+    }
+}
+
+/** Appends size bytes from data. */
+__always_inline
+static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
+    grow_buffer(buf, size);
+
+    memcpy(buf->buf + buf->cur, data, size);
+    buf->cur += size;
+}
+
+/** Appends a single byte. */
+__always_inline
+static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
+    grow_buffer_small(buf);
+
+    *(buf->buf + buf->cur) = c;
+    buf->cur += sizeof(c);
+}
+
+/** Appends str including its terminating NUL. */
+__always_inline
+static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
+    dyn_buffer_write(buf, str, strlen(str));
+    dyn_buffer_write_char(buf, '\0');
+}
+
+/** Appends str without its terminating NUL. */
+__always_inline
+static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
+    dyn_buffer_write(buf, str, strlen(str));
+}
+
+/** Appends the object representation of an int. */
+__always_inline
+static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
+    grow_buffer_small(buf);
+
+    // The write offset is not necessarily int-aligned: copy the bytes rather
+    // than storing through a cast pointer
+    memcpy(buf->buf + buf->cur, &d, sizeof(d));
+    buf->cur += sizeof(d);
+}
+
+/** Appends the object representation of a short. */
+__always_inline
+static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) {
+    grow_buffer_small(buf);
+
+    // See dyn_buffer_write_int: no alignment is guaranteed at the offset
+    memcpy(buf->buf + buf->cur, &s, sizeof(s));
+    buf->cur += sizeof(s);
+}
+
+/** Appends the object representation of an unsigned long. */
+__always_inline
+static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
+    grow_buffer_small(buf);
+
+    // See dyn_buffer_write_int: no alignment is guaranteed at the offset
+    memcpy(buf->buf + buf->cur, &l, sizeof(l));
+    buf->cur += sizeof(l);
+}
+
+/** Frees the storage owned by the buffer. */
+__always_inline
+static void dyn_buffer_destroy(dyn_buffer_t *buf) {
+    char *storage = buf->buf;
+    free(storage);
+}
+
+/** Frees the storage owned by the embedded dyn_buffer. */
+__always_inline
+static void text_buffer_destroy(text_buffer_t *buf) {
+    dyn_buffer_t *inner = &buf->dyn_buffer;
+    dyn_buffer_destroy(inner);
+}
+
+/**
+ * Returns an empty text buffer whose size limit is max_size. The limit is
+ * only enforced by text_buffer_append_char when it is greater than zero.
+ */
+__always_inline
+static text_buffer_t text_buffer_create(long max_size) {
+    text_buffer_t created = {
+            .max_size = max_size,
+            .last_char_was_whitespace = FALSE,
+            .dyn_buffer = dyn_buffer_create(),
+    };
+
+    return created;
+}
+
+/**
+ * Appends code point c, encoded as UTF-8.
+ *
+ * A space or an ignored character is written as one space, and only when the
+ * buffer is not empty and the previous character was not already whitespace.
+ *
+ * @return TEXT_BUF_FULL when a write made the buffer reach max_size
+ *         (checked only for max_size > 0), 0 otherwise
+ */
+__always_inline
+static int text_buffer_append_char(text_buffer_t *buf, int c) {
+    dyn_buffer_t *dst = &buf->dyn_buffer;
+
+    if (c == ' ' || SHOULD_IGNORE_CHAR(c)) {
+        // Nothing is written for leading or repeated whitespace
+        if (buf->last_char_was_whitespace || dst->cur == 0) {
+            return 0;
+        }
+
+        dyn_buffer_write_char(dst, ' ');
+        buf->last_char_was_whitespace = TRUE;
+
+        if (buf->max_size > 0 && dst->cur >= buf->max_size) {
+            return TEXT_BUF_FULL;
+        }
+        return 0;
+    }
+
+    buf->last_char_was_whitespace = FALSE;
+    grow_buffer_small(dst);
+
+    // Taken after the grow: the storage may have moved
+    char *out = dst->buf + dst->cur;
+
+    if (((utf8_int32_t) 0xffffff80 & c) == 0) {
+        out[0] = (char) c;
+        dst->cur += 1;
+    } else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
+        out[0] = 0xc0 | (char) (c >> 6);
+        out[1] = 0x80 | (char) (c & 0x3f);
+        dst->cur += 2;
+    } else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
+        out[0] = 0xe0 | (char) (c >> 12);
+        out[1] = 0x80 | (char) ((c >> 6) & 0x3f);
+        out[2] = 0x80 | (char) (c & 0x3f);
+        dst->cur += 3;
+    } else {
+        out[0] = 0xf0 | (char) (c >> 18);
+        out[1] = 0x80 | (char) ((c >> 12) & 0x3f);
+        out[2] = 0x80 | (char) ((c >> 6) & 0x3f);
+        out[3] = 0x80 | (char) (c & 0x3f);
+        dst->cur += 4;
+    }
+
+    if (buf->max_size > 0 && dst->cur >= buf->max_size) {
+        return TEXT_BUF_FULL;
+    }
+
+    return 0;
+}
+
+
+/**
+ * Ends the text with a NUL byte: a trailing space is overwritten by the NUL,
+ * otherwise the NUL is appended.
+ */
+__always_inline
+static void text_buffer_terminate_string(text_buffer_t *buf) {
+    if (buf->dyn_buffer.cur > 0 && *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) == ' ') {
+        *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) = '\0';
+    } else {
+        dyn_buffer_write_char(&buf->dyn_buffer, '\0');
+    }
+}
+
+// True when ptr has reached the end of str: len bytes consumed, a NUL byte, or
+// a multi-byte lead byte whose sequence would extend past len. Expects locals
+// named ptr, str and len. In text_buffer_append_string len is a size_t, so
+// `len - N` wraps for len < N; inputs that short are handled by the
+// `len <= 4` branch of that function.
+#define UTF8_END_OF_STRING \
+    (ptr - str >= len || *ptr == 0 || \
+    (0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
+    (0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
+    (0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
+
+/**
+ * Decodes up to len bytes of str as UTF-8 and appends every valid code point
+ * through text_buffer_append_char. Invalid sequences are skipped.
+ *
+ * Inputs of at most 4 bytes take a shortcut: their ASCII bytes are copied
+ * verbatim and every other byte is dropped.
+ *
+ * @return 0 when the input was consumed (or str is NULL or empty), otherwise
+ *         the non-zero value returned by text_buffer_append_char
+ */
+__always_inline
+static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
+
+    const char *ptr = str;
+    const char *oldPtr = ptr;
+
+    if (str == NULL || UTF8_END_OF_STRING) {
+        return 0;
+    }
+
+    if (len <= 4) {
+        for (size_t i = 0; i < len; i++) {
+            if (((utf8_int32_t) 0xffffff80 & str[i]) == 0) {
+                dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
+            }
+        }
+        return 0;
+    }
+
+    utf8_int32_t c;
+    char tmp[16];
+
+    do {
+        ptr = utf8codepoint(ptr, &c);
+        // Clear the whole scratch buffer: utf8_validchr2 also looks at the
+        // byte that follows the sequence (up to tmp[4]), which must read as 0
+        memset(tmp, 0, sizeof(tmp));
+        memcpy(tmp, oldPtr, ptr - oldPtr);
+        oldPtr = ptr;
+
+        if (!utf8_validchr2(tmp)) {
+            continue;
+        }
+
+        int ret = text_buffer_append_char(buf, c);
+
+        if (ret != 0) {
+            return ret;
+        }
+    } while (!UTF8_END_OF_STRING);
+
+    return 0;
+}
+
+/**
+ * Same as text_buffer_append_string for a NUL-terminated string.
+ * A NULL str appends nothing and returns 0.
+ */
+__always_inline
+static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
+    if (str == NULL) {
+        return 0;
+    }
+    return text_buffer_append_string(buf, str, strlen(str));
+}
+
+#endif
diff --git a/third-party/utf8.h b/third-party/utf8.h
new file mode 160000
index 0000000..b686b0c
--- /dev/null
+++ b/third-party/utf8.h
@@ -0,0 +1 @@
+Subproject commit b686b0c5181c2dd9f8297e6ac3692c9614b083be
diff --git a/third-party/uuid b/third-party/uuid
new file mode 160000
index 0000000..f895102
--- /dev/null
+++ b/third-party/uuid
@@ -0,0 +1 @@
+Subproject commit f895102e2ddaf86387a62c3544abb78c0a5cfbae