mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 12:23:00 +00:00
text component
This commit is contained in:
commit
a24d4dc538
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
.idea/
|
||||
cmake_install.cmake
|
||||
Makefile
|
||||
libscan.a
|
||||
libscan.so
|
||||
*.cbp
|
||||
CMakeFiles
|
||||
CMakeCache.txt
|
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
[submodule "third-party/uuid"]
|
||||
path = third-party/uuid
|
||||
url = https://github.com/certik/uuid
|
||||
[submodule "third-party/utf8.h"]
|
||||
path = third-party/utf8.h
|
||||
url = https://github.com/sheredom/utf8.h
|
33
CMakeLists.txt
Normal file
33
CMakeLists.txt
Normal file
@ -0,0 +1,33 @@
|
||||
# Build script for the "scan" library target (C11).
cmake_minimum_required(VERSION 3.15)
|
||||
project(scan C)
|
||||
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
|
||||
|
||||
# No STATIC/SHARED keyword is given, so the library type follows BUILD_SHARED_LIBS.
add_library(
|
||||
scan
|
||||
src/text.c src/text.h
|
||||
src/util.c src/util.h
|
||||
src/types.h
|
||||
src/macros.h
|
||||
|
||||
# NOTE(review): .gitmodules declares third-party/utf8.h as a submodule path (a directory);
# util.h includes the header as third-party/utf8.h/utf8.h - confirm this entry is intended.
third-party/utf8.h
|
||||
)
|
||||
|
||||
|
||||
# Third party
|
||||
add_subdirectory(third-party/uuid/)
|
||||
# Makes "uuid.h" (included by src/types.h) resolvable while compiling scan itself.
target_include_directories(
|
||||
scan PRIVATE
|
||||
third-party/uuid/src/
|
||||
)
|
||||
|
||||
# Every diagnostic the compiler emits is treated as an error.
target_compile_options(
|
||||
scan PRIVATE
|
||||
-Werror
|
||||
)
|
||||
|
||||
target_link_libraries(
|
||||
scan
|
||||
uuid
|
||||
)
|
13
src/macros.h
Normal file
13
src/macros.h
Normal file
@ -0,0 +1,13 @@
|
||||
/* Boolean constants; each is only defined when nothing earlier defined it. */
#ifndef FALSE
|
||||
#define FALSE (0)
|
||||
#endif
|
||||
|
||||
#ifndef TRUE
|
||||
#define TRUE (!FALSE)
|
||||
#endif
|
||||
|
||||
/*
 * MAX / MIN replace any previous definition of the same name.
 * The selected argument is expanded twice (once in the comparison, once as
 * the result), so do not pass expressions with side effects such as i++.
 */
#undef MAX
|
||||
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
|
||||
|
||||
#undef MIN
|
||||
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
29
src/text.c
Normal file
29
src/text.c
Normal file
@ -0,0 +1,29 @@
|
||||
#include "text.h"
|
||||
|
||||
/*
 * Extract the content of a plain-text file and attach it to doc.
 *
 * Reads at most MIN(ctx->content_size, doc->size) bytes through f->read, feeds
 * the bytes that were actually read through a text_buffer_t (created with
 * ctx->content_size as its maximum size) and appends the NUL-terminated result
 * to doc's metadata list as a MetaContent entry.
 *
 * ctx: parser settings; content_size bounds how much is read and kept
 * f:   virtual file to read from (only f->read is used)
 * doc: document that receives the new meta line; doc->size bounds the read
 *
 * Returns SCAN_OK on success. Returns SCAN_ERR_READ when the read fails or
 * when a working buffer cannot be allocated (there is no dedicated error code
 * for allocation failure). Nothing is appended to doc on failure. On success
 * the meta line is owned by doc's list.
 */
scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc) {

    int to_read = MIN(ctx->content_size, doc->size);

    char *buf = malloc(to_read);
    // malloc(0) may legitimately return NULL, only treat NULL as failure when bytes were requested
    if (buf == NULL && to_read > 0) {
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        //TODO: log
        free(buf);
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    if (tex.dyn_buffer.buf == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }

    // Only the first ret bytes of buf were filled in by the read.
    // A TEXT_BUF_FULL result just means the content was truncated, which is fine here.
    (void) text_buffer_append_string(&tex, buf, ret);
    text_buffer_terminate_string(&tex);
    free(buf);

    // dyn_buffer.cur counts the terminating NUL, so this is exactly enough room for the string
    meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
    if (meta == NULL) {
        text_buffer_destroy(&tex);
        return SCAN_ERR_READ;
    }

    meta->key = MetaContent;
    strcpy(meta->str_val, tex.dyn_buffer.buf);

    APPEND_META(doc, meta);

    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
||||
|
13
src/text.h
Normal file
13
src/text.h
Normal file
@ -0,0 +1,13 @@
|
||||
#ifndef SCAN_TEXT_H
|
||||
#define SCAN_TEXT_H
|
||||
|
||||
#include "types.h"
|
||||
#include "util.h"
|
||||
|
||||
/* Settings for the plain-text parser. */
typedef struct {
|
||||
// Upper bound used by parse_text(): at most this many bytes are read from the
// file, and the value is also passed to text_buffer_create() as the maximum size.
long content_size;
|
||||
} scan_text_ctx_t;
|
||||
|
||||
// Reads the file behind f and appends its text to doc as a MetaContent meta line.
// Returns SCAN_OK, or SCAN_ERR_READ when f->read reports an error.
scan_code_t parse_text(scan_text_ctx_t *ctx, struct vfile *f, document_t *doc);
|
||||
|
||||
#endif
|
124
src/types.h
Normal file
124
src/types.h
Normal file
@ -0,0 +1,124 @@
|
||||
#ifndef SCAN_TYPES_H
|
||||
#define SCAN_TYPES_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include "uuid.h"
|
||||
|
||||
|
||||
// TODO
|
||||
// Archive handling modes. Nothing in this file uses them yet.
// NOTE(review): presumably they select how archive contents are processed
// (skip / list entries / one level / recursive) - confirm against the archive parser.
#define ARC_MODE_SKIP 0
|
||||
#define ARC_MODE_LIST 1
|
||||
#define ARC_MODE_SHALLOW 2
|
||||
#define ARC_MODE_RECURSE 3
|
||||
// Plain int; presumably holds one of the ARC_MODE_* values above.
typedef int archive_mode_t;
|
||||
|
||||
/* Type tags OR-ed into a metadata key to record which kind of value it carries. */
#define META_INT_MASK 0x80
#define META_STR_MASK 0x40
#define META_LONG_MASK 0x20

/*
 * Build a typed key from a plain id by setting the matching tag bit.
 * Fully parenthesised so the result can be used inside larger expressions.
 */
#define META_STR(id) (((unsigned) (id)) | ((unsigned) META_STR_MASK))
#define META_INT(id) (((unsigned) (id)) | ((unsigned) META_INT_MASK))
#define META_LONG(id) (((unsigned) (id)) | ((unsigned) META_LONG_MASK))

/*
 * Tag tests. IS_META_INT and IS_META_LONG take the key value itself,
 * IS_META_STR takes a pointer to an object that has a `key` member.
 */
#define IS_META_INT(key) (((key) & META_INT_MASK) == META_INT_MASK)
#define IS_META_LONG(key) (((key) & META_LONG_MASK) == META_LONG_MASK)
#define IS_META_STR(meta) (((meta)->key & META_STR_MASK) == META_STR_MASK)
|
||||
|
||||
|
||||
/* Result code returned by the parsers. */
typedef int scan_code_t;
/* Parenthesised so the casts stay intact inside larger expressions (e.g. after sizeof). */
#define SCAN_OK ((scan_code_t) 0)
#define SCAN_ERR_READ ((scan_code_t) -1)
|
||||
|
||||
// This is written to file as a 16-bit int!
|
||||
// Metadata keys. Each value is a small id combined with META_STR / META_INT /
// META_LONG, which OR a type tag (0x40 / 0x80 / 0x20) into the id, so the key
// itself records which kind of value the meta line carries.
enum metakey {
|
||||
MetaContent = META_STR(1),
|
||||
MetaWidth = META_INT(2),
|
||||
MetaHeight = META_INT(3),
|
||||
MetaMediaDuration = META_LONG(4),
|
||||
MetaMediaAudioCodec = META_INT(5),
|
||||
MetaMediaVideoCodec = META_INT(6),
|
||||
MetaMediaBitrate = META_LONG(7),
|
||||
MetaArtist = META_STR(8),
|
||||
MetaAlbum = META_STR(9),
|
||||
MetaAlbumArtist = META_STR(10),
|
||||
MetaGenre = META_STR(11),
|
||||
MetaTitle = META_STR(12),
|
||||
MetaFontName = META_STR(13),
|
||||
MetaParent = META_STR(14),
|
||||
MetaExifMake = META_STR(15),
|
||||
MetaExifSoftware = META_STR(16),
|
||||
MetaExifExposureTime = META_STR(17),
|
||||
MetaExifFNumber = META_STR(18),
|
||||
MetaExifFocalLength = META_STR(19),
|
||||
MetaExifUserComment = META_STR(20),
|
||||
MetaExifModel = META_STR(21),
|
||||
MetaExifIsoSpeedRatings = META_STR(22),
|
||||
MetaExifDateTime = META_STR(23),
|
||||
};
|
||||
|
||||
// One metadata entry of a document; entries form a singly linked list.
typedef struct meta_line {
|
||||
// Next entry in the list, set to NULL by APPEND_META when the entry is appended.
struct meta_line *next;
|
||||
enum metakey key;
|
||||
// Value storage. Presumably the type tag in `key` tells which member is valid - verify against readers.
union {
|
||||
// Zero-length array (compiler extension): string values live in extra bytes
// allocated directly behind the struct, see the malloc in parse_text().
char str_val[0];
|
||||
int int_val;
|
||||
unsigned long long_val;
|
||||
};
|
||||
} meta_line_t;
|
||||
|
||||
|
||||
// A scanned document together with the metadata collected for it.
// NOTE(review): the meaning of uuid/ino/mime/mtime/base/ext is not visible in this
// file; the names suggest file identity and path offsets - confirm with the producer.
typedef struct document {
|
||||
unsigned char uuid[16];
|
||||
unsigned long ino;
|
||||
// parse_text() uses this as an upper bound on the number of bytes it reads.
unsigned long size;
|
||||
unsigned int mime;
|
||||
int mtime;
|
||||
short base;
|
||||
short ext;
|
||||
// Head and tail of the meta_line_t list maintained by APPEND_META;
// meta_head must be NULL while the list is empty.
meta_line_t *meta_head;
|
||||
meta_line_t *meta_tail;
|
||||
char *filepath;
|
||||
} document_t;
|
||||
|
||||
// Forward declaration so the callback types below can refer to the struct.
typedef struct vfile vfile_t;
|
||||
|
||||
// Read callback: fills buf with up to size bytes. parse_text() treats a negative
// return value as a read error.
// NOTE(review): the attribute is presumably meant to force callers to check the result.
__attribute__((warn_unused_result))
|
||||
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
|
||||
|
||||
// Close callback for a vfile.
typedef void (*close_func_t)(struct vfile *);
|
||||
|
||||
// A file that is accessed through callbacks instead of directly.
typedef struct vfile {
|
||||
// Underlying handle: either a file descriptor or an archive handle.
// Presumably is_fs_file tells which member is in use - verify against the implementations.
union {
|
||||
int fd;
|
||||
struct archive *arc;
|
||||
};
|
||||
|
||||
int is_fs_file;
|
||||
char *filepath;
|
||||
|
||||
read_func_t read;
|
||||
close_func_t close;
|
||||
} vfile_t;
|
||||
|
||||
// Description of one file to parse.
// NOTE(review): base/ext look like offsets into filepath, as in document_t - confirm.
typedef struct parse_job_t {
|
||||
int base;
|
||||
int ext;
|
||||
struct stat info;
|
||||
struct vfile vfile;
|
||||
uuid_t parent;
|
||||
// One-element trailing array; presumably the struct is over-allocated so the
// full path can be stored here - verify against the code that creates jobs.
char filepath[1];
|
||||
} parse_job_t;
|
||||
|
||||
|
||||
/*
 * Append `meta` to the end of `doc`'s metadata list.
 *
 * doc:  pointer to an object with meta_head / meta_tail members;
 *       meta_head must be NULL while the list is empty
 * meta: pointer to the entry to append; its `next` member is reset to NULL
 *
 * The expansion is a single braced block, so the macro is safe as the body of
 * an unbraced `if` and works with or without a trailing semicolon at the call
 * site. Both arguments are expanded several times: pass plain pointer
 * expressions without side effects.
 */
#define APPEND_META(doc, meta) \
    { \
        (meta)->next = NULL; \
        if ((doc)->meta_head == NULL) { \
            (doc)->meta_head = (meta); \
            (doc)->meta_tail = (doc)->meta_head; \
        } else { \
            (doc)->meta_tail->next = (meta); \
            (doc)->meta_tail = (meta); \
        } \
    }
|
||||
|
||||
#endif
|
0
src/util.c
Normal file
0
src/util.c
Normal file
283
src/util.h
Normal file
283
src/util.h
Normal file
@ -0,0 +1,283 @@
|
||||
#ifndef SIST2_UTIL_H
|
||||
#define SIST2_UTIL_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "../third-party/utf8.h/utf8.h"
|
||||
#include "macros.h"
|
||||
|
||||
/* Returned by the text_buffer_append_* functions once the buffer reached its maximum size. */
#define TEXT_BUF_FULL (-1)
/* Initial capacity in bytes of a dyn_buffer_t (16 KiB). */
#define INITIAL_BUF_SIZE (1024 * 16)

/*
 * Character filter applied to every code point appended to a text buffer.
 * Kept: the range '\'' .. ';', the range 'A' .. 'z', and everything above 127.
 * Anything else is ignored (text_buffer_append_char turns it into a single space).
 */
#define SHOULD_IGNORE_CHAR(c) (!(SHOULD_KEEP_CHAR(c)))
#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127))
|
||||
|
||||
|
||||
// Growable byte buffer.
typedef struct dyn_buffer {
|
||||
// Heap storage, allocated by dyn_buffer_create() and released by dyn_buffer_destroy().
char *buf;
|
||||
// Write offset: number of bytes written so far.
size_t cur;
|
||||
// Current capacity of buf in bytes.
size_t size;
|
||||
} dyn_buffer_t;
|
||||
|
||||
// Accumulates filtered UTF-8 text on top of a dyn_buffer_t.
typedef struct text_buffer {
|
||||
// When > 0, text_buffer_append_char() reports TEXT_BUF_FULL once this many bytes were written.
long max_size;
|
||||
// TRUE right after a space was written; used to collapse runs of ignored characters.
int last_char_was_whitespace;
|
||||
dyn_buffer_t dyn_buffer;
|
||||
} text_buffer_t;
|
||||
|
||||
__always_inline
|
||||
static int utf8_validchr2(const char *s) {
|
||||
if (0x00 == (0x80 & *s)) {
|
||||
return TRUE;
|
||||
} else if (0xf0 == (0xf8 & *s)) {
|
||||
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
|
||||
(0x80 != (0xc0 & s[3]))) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0x80 == (0xc0 & s[4])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
|
||||
return FALSE;
|
||||
}
|
||||
} else if (0xe0 == (0xf0 & *s)) {
|
||||
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0x80 == (0xc0 & s[3])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
|
||||
return FALSE;
|
||||
}
|
||||
} else if (0xc0 == (0xe0 & *s)) {
|
||||
if (0x80 != (0xc0 & s[1])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0x80 == (0xc0 & s[2])) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (0 == (0x1e & s[0])) {
|
||||
return FALSE;
|
||||
}
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
__always_inline
|
||||
static dyn_buffer_t dyn_buffer_create() {
|
||||
dyn_buffer_t buf;
|
||||
|
||||
buf.size = INITIAL_BUF_SIZE;
|
||||
buf.cur = 0;
|
||||
buf.buf = malloc(INITIAL_BUF_SIZE);
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
|
||||
if (buf->cur + size > buf->size) {
|
||||
do {
|
||||
buf->size *= 2;
|
||||
} while (buf->cur + size > buf->size);
|
||||
|
||||
buf->buf = realloc(buf->buf, buf->size);
|
||||
}
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void grow_buffer_small(dyn_buffer_t *buf) {
|
||||
if (buf->cur + sizeof(long) > buf->size) {
|
||||
buf->size *= 2;
|
||||
buf->buf = realloc(buf->buf, buf->size);
|
||||
}
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
|
||||
grow_buffer(buf, size);
|
||||
|
||||
memcpy(buf->buf + buf->cur, data, size);
|
||||
buf->cur += size;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
|
||||
grow_buffer_small(buf);
|
||||
|
||||
*(buf->buf + buf->cur) = c;
|
||||
buf->cur += sizeof(c);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_write_str(dyn_buffer_t *buf, char *str) {
|
||||
dyn_buffer_write(buf, str, strlen(str));
|
||||
dyn_buffer_write_char(buf, '\0');
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_append_string(dyn_buffer_t *buf, char *str) {
|
||||
dyn_buffer_write(buf, str, strlen(str));
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
|
||||
grow_buffer_small(buf);
|
||||
|
||||
*(int *) (buf->buf + buf->cur) = d;
|
||||
buf->cur += sizeof(int);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) {
|
||||
grow_buffer_small(buf);
|
||||
|
||||
*(short *) (buf->buf + buf->cur) = s;
|
||||
buf->cur += sizeof(short);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
|
||||
grow_buffer_small(buf);
|
||||
|
||||
*(unsigned long *) (buf->buf + buf->cur) = l;
|
||||
buf->cur += sizeof(unsigned long);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
|
||||
free(buf->buf);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void text_buffer_destroy(text_buffer_t *buf) {
|
||||
dyn_buffer_destroy(&buf->dyn_buffer);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static text_buffer_t text_buffer_create(long max_size) {
|
||||
text_buffer_t text_buf;
|
||||
|
||||
text_buf.dyn_buffer = dyn_buffer_create();
|
||||
text_buf.max_size = max_size;
|
||||
text_buf.last_char_was_whitespace = FALSE;
|
||||
|
||||
return text_buf;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int text_buffer_append_char(text_buffer_t *buf, int c) {
|
||||
|
||||
if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
|
||||
if (!buf->last_char_was_whitespace && buf->dyn_buffer.cur != 0) {
|
||||
dyn_buffer_write_char(&buf->dyn_buffer, ' ');
|
||||
buf->last_char_was_whitespace = TRUE;
|
||||
|
||||
if (buf->max_size > 0 && buf->dyn_buffer.cur >= buf->max_size) {
|
||||
return TEXT_BUF_FULL;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
buf->last_char_was_whitespace = FALSE;
|
||||
grow_buffer_small(&buf->dyn_buffer);
|
||||
|
||||
if (((utf8_int32_t) 0xffffff80 & c) == 0) {
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = (char) c;
|
||||
} else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xc0 | (char) (c >> 6);
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
|
||||
} else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xe0 | (char) (c >> 12);
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
|
||||
} else {
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xf0 | (char) (c >> 18);
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 12) & 0x3f);
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
|
||||
}
|
||||
|
||||
if (buf->max_size > 0 && buf->dyn_buffer.cur >= buf->max_size) {
|
||||
return TEXT_BUF_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
__always_inline
|
||||
static void text_buffer_terminate_string(text_buffer_t *buf) {
|
||||
if (buf->dyn_buffer.cur > 0 && *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) == ' ') {
|
||||
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) = '\0';
|
||||
} else {
|
||||
dyn_buffer_write_char(&buf->dyn_buffer, '\0');
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * True when no further complete character can be read at `ptr`.
 *
 * Expects these names in the scope of use: `str` (start of the input), `len`
 * (its length in bytes, size_t) and `ptr` (current position inside str).
 * The input ends when `len` bytes were consumed, at a NUL byte, or when the
 * lead byte at `ptr` announces more bytes (2, 3 or 4) than are left.
 *
 * The remaining-bytes tests add to the offset instead of subtracting from
 * `len`, so they stay correct when `len` is smaller than the sequence length.
 */
#define UTF8_END_OF_STRING \
    ((size_t) (ptr - str) >= len || *ptr == 0 || \
    (0xc0 == (0xe0 & *ptr) && (size_t) (ptr - str) + 2 > len) || \
    (0xe0 == (0xf0 & *ptr) && (size_t) (ptr - str) + 3 > len) || \
    (0xf0 == (0xf8 & *ptr) && (size_t) (ptr - str) + 4 > len))
|
||||
|
||||
__always_inline
|
||||
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
|
||||
|
||||
const char *ptr = str;
|
||||
const char *oldPtr = ptr;
|
||||
|
||||
if (str == NULL || UTF8_END_OF_STRING) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (len <= 4) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (((utf8_int32_t)0xffffff80 & str[i]) == 0) {
|
||||
dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
utf8_int32_t c;
|
||||
char tmp[16];
|
||||
|
||||
do {
|
||||
ptr = utf8codepoint(ptr, &c);
|
||||
*(int *) tmp = 0x00000000;
|
||||
memcpy(tmp, oldPtr, ptr - oldPtr);
|
||||
oldPtr = ptr;
|
||||
|
||||
if (!utf8_validchr2(tmp)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int ret = text_buffer_append_char(buf, c);
|
||||
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
} while (!UTF8_END_OF_STRING);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int text_buffer_append_string0(text_buffer_t *buf, char *str) {
|
||||
return text_buffer_append_string(buf, str, strlen(str));
|
||||
}
|
||||
|
||||
#endif
|
1
third-party/utf8.h
vendored
Submodule
1
third-party/utf8.h
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit b686b0c5181c2dd9f8297e6ac3692c9614b083be
|
1
third-party/uuid
vendored
Submodule
1
third-party/uuid
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit f895102e2ddaf86387a62c3544abb78c0a5cfbae
|
Loading…
x
Reference in New Issue
Block a user