mirror of
https://github.com/simon987/sist2.git
synced 2025-04-19 18:26:43 +00:00
wip doc rewrite
This commit is contained in:
parent
641edf2715
commit
d7cbd5d2b6
@ -81,14 +81,20 @@ target_link_directories(
|
|||||||
${UUID_LIBRARY_DIRS}
|
${UUID_LIBRARY_DIRS}
|
||||||
)
|
)
|
||||||
|
|
||||||
target_compile_options(sist2
|
target_compile_options(
|
||||||
|
sist2
|
||||||
PRIVATE
|
PRIVATE
|
||||||
-Ofast
|
|
||||||
# -march=native
|
|
||||||
-fPIC
|
-fPIC
|
||||||
-fno-stack-protector
|
|
||||||
-fomit-frame-pointer
|
-Ofast
|
||||||
)
|
# -fno-stack-protector
|
||||||
|
# -fomit-frame-pointer
|
||||||
|
|
||||||
|
-g
|
||||||
|
-march=native
|
||||||
|
-fstack-protector
|
||||||
|
)
|
||||||
|
|
||||||
TARGET_LINK_LIBRARIES(
|
TARGET_LINK_LIBRARIES(
|
||||||
sist2
|
sist2
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
#include "cli.h"
|
#include "cli.h"
|
||||||
#include "ctx.h"
|
#include "ctx.h"
|
||||||
|
|
||||||
#include <tesseract/capi.h>
|
|
||||||
|
|
||||||
#define DEFAULT_OUTPUT "index.sist2/"
|
#define DEFAULT_OUTPUT "index.sist2/"
|
||||||
#define DEFAULT_CONTENT_SIZE 32768
|
#define DEFAULT_CONTENT_SIZE 32768
|
||||||
#define DEFAULT_QUALITY 5
|
#define DEFAULT_QUALITY 5
|
||||||
|
@ -1,12 +1,6 @@
|
|||||||
#include "elastic.h"
|
#include "elastic.h"
|
||||||
#include "src/ctx.h"
|
#include "src/ctx.h"
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include "web.h"
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <cJSON/cJSON.h>
|
|
||||||
|
|
||||||
#include "static_generated.c"
|
#include "static_generated.c"
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,8 +15,7 @@ store_t *store_create(char *path) {
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (open_ret != 0) {
|
if (open_ret != 0) {
|
||||||
fprintf(stderr, "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path);
|
LOG_FATALF("store.c", "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path)
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
store->size = (size_t) 1024 * 1024 * 5;
|
store->size = (size_t) 1024 * 1024 * 5;
|
||||||
@ -42,6 +41,9 @@ void store_destroy(store_t *store) {
|
|||||||
|
|
||||||
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
|
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
|
||||||
|
|
||||||
|
// TODO: DEBUG
|
||||||
|
return;
|
||||||
|
|
||||||
if (LogCtx.very_verbose) {
|
if (LogCtx.very_verbose) {
|
||||||
char uuid_str[UUID_STR_LEN];
|
char uuid_str[UUID_STR_LEN];
|
||||||
uuid_unparse((unsigned char *) key, uuid_str);
|
uuid_unparse((unsigned char *) key, uuid_str);
|
||||||
@ -82,7 +84,7 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
|
|||||||
pthread_rwlock_unlock(&store->lock);
|
pthread_rwlock_unlock(&store->lock);
|
||||||
|
|
||||||
if (put_ret != 0) {
|
if (put_ret != 0) {
|
||||||
printf("%s\n", mdb_strerror(put_ret));
|
LOG_ERROR("store.c", mdb_strerror(put_ret))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#import "cbr.h"
|
#include "cbr.h"
|
||||||
#import "src/ctx.h"
|
#include "src/ctx.h"
|
||||||
|
|
||||||
unsigned int cbr_mime;
|
unsigned int cbr_mime;
|
||||||
unsigned int cbz_mime;
|
unsigned int cbz_mime;
|
||||||
|
@ -1,46 +1,10 @@
|
|||||||
#include "doc.h"
|
#include "doc.h"
|
||||||
#include "src/ctx.h"
|
#include "src/ctx.h"
|
||||||
|
|
||||||
int dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {
|
|
||||||
|
|
||||||
mce_skip_attributes(reader);
|
|
||||||
|
|
||||||
xmlErrorPtr err = xmlGetLastError();
|
|
||||||
if (err != NULL) {
|
|
||||||
if (err->level == XML_ERR_FATAL) {
|
|
||||||
LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
|
|
||||||
return -1;
|
|
||||||
} else {
|
|
||||||
LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
mce_start_children(reader) {
|
|
||||||
mce_start_element(reader, NULL, _X("t")) {
|
|
||||||
mce_skip_attributes(reader);
|
|
||||||
mce_start_children(reader) {
|
|
||||||
mce_start_text(reader) {
|
|
||||||
char *str = (char *) xmlTextReaderConstValue(reader->reader);
|
|
||||||
dyn_buffer_append_string(buf, str);
|
|
||||||
dyn_buffer_write_char(buf, ' ');
|
|
||||||
} mce_end_text(reader);
|
|
||||||
} mce_end_children(reader);
|
|
||||||
} mce_end_element(reader);
|
|
||||||
|
|
||||||
mce_start_element(reader, NULL, NULL) {
|
|
||||||
int ret = dump_text(reader, buf);
|
|
||||||
if (ret != 0) {
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
} mce_end_element(reader);
|
|
||||||
|
|
||||||
} mce_end_children(reader)
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
__always_inline
|
__always_inline
|
||||||
int should_read_part(opcPart part) {
|
static int should_read_part(char *part) {
|
||||||
|
|
||||||
|
LOG_DEBUGF("doc.c", "Got part : %s", part)
|
||||||
char *part_name = (char *) part;
|
char *part_name = (char *) part;
|
||||||
|
|
||||||
if (part == NULL) {
|
if (part == NULL) {
|
||||||
@ -65,56 +29,115 @@ int should_read_part(opcPart part) {
|
|||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
__always_inline
|
typedef int (XMLCALL *xmlInputReadCallback)(void *context, char *buffer, int len);
|
||||||
int read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part, document_t *doc) {
|
|
||||||
|
|
||||||
mceTextReader_t reader;
|
typedef struct {
|
||||||
int ret = opcXmlReaderOpen(c, &reader, part, NULL, "UTF-8", XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
struct archive *a;
|
||||||
|
} xml_io_ctx;
|
||||||
|
|
||||||
if (ret != OPC_ERROR_NONE) {
|
int xml_io_read(void *context, char *buffer, int len) {
|
||||||
LOG_ERRORF(doc->filepath, "(doc.c) opcXmlReaderOpen() returned error code %d", ret);
|
xml_io_ctx *ctx = context;
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
mce_start_document(&reader) {
|
//TODO: return value ?
|
||||||
mce_start_element(&reader, NULL, NULL) {
|
return archive_read_data(ctx->a, buffer, len);
|
||||||
ret = dump_text(&reader, buf);
|
}
|
||||||
if (ret != 0) {
|
|
||||||
mceTextReaderCleanup(&reader);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
} mce_end_element(&reader);
|
|
||||||
} mce_end_document(&reader);
|
|
||||||
|
|
||||||
mceTextReaderCleanup(&reader);
|
int xml_io_close(void *context) {
|
||||||
|
//noop
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__always_inline
|
||||||
|
static int read_part(struct archive *a, dyn_buffer_t *buf, document_t *doc) {
|
||||||
|
|
||||||
|
xmlNode *root, *first_child, *node1, *node2, *node3, *node4;
|
||||||
|
|
||||||
|
xml_io_ctx ctx = {a};
|
||||||
|
|
||||||
|
/* do actual parsing of document */
|
||||||
|
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, &ctx, "/", NULL, 0);
|
||||||
|
|
||||||
|
/* error checking! */
|
||||||
|
if (xml == NULL) {
|
||||||
|
fprintf(stderr, "Document not parsed successfully. \n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
root = xmlDocGetRootElement(xml);
|
||||||
|
if (root == NULL) {
|
||||||
|
fprintf(stderr, "empty document\n");
|
||||||
|
xmlFreeDoc(xml);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) {
|
||||||
|
fprintf(stderr, "document of the wrong type, root node != document");
|
||||||
|
xmlFreeDoc(xml);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* init a few more variables */
|
||||||
|
xmlChar *key;
|
||||||
|
|
||||||
|
first_child = root->children;
|
||||||
|
for (node1 = first_child; node1; node1 = node1->next) {
|
||||||
|
if ((xmlStrcmp(node1->name, (const xmlChar *) "body")) == 0) {
|
||||||
|
for (node2 = node1->children; node2; node2 = node2->next) {
|
||||||
|
if ((xmlStrcmp(node2->name, (const xmlChar *) "p")) == 0) {
|
||||||
|
|
||||||
|
dyn_buffer_write_char(buf, ' ');
|
||||||
|
|
||||||
|
for (node3 = node2->children; node3; node3 = node3->next) {
|
||||||
|
if ((xmlStrcmp(node3->name, (const xmlChar *) "r")) == 0) {
|
||||||
|
for (node4 = node3->children; node4; node4 = node4->next) {
|
||||||
|
if ((!xmlStrcmp(node4->name, (const xmlChar *) "t"))) {
|
||||||
|
key = xmlNodeListGetString(xml, node4->xmlChildrenNode, 1);
|
||||||
|
|
||||||
|
dyn_buffer_append_string(buf, (char *) key);
|
||||||
|
dyn_buffer_write_char(buf, ' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
||||||
|
|
||||||
if (mem == NULL) {
|
if (mem == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
opcContainer *c = opcContainerOpenMem(mem, mem_len, OPC_OPEN_READ_ONLY, NULL);
|
struct archive *a = archive_read_new();
|
||||||
if (c == NULL) {
|
archive_read_support_format_zip(a);
|
||||||
LOG_ERROR(doc->filepath, "(doc.c) Couldn't open document with opcContainerOpenMem()");
|
|
||||||
|
int ret = archive_read_open_memory(a, mem, mem_len);
|
||||||
|
if (ret != ARCHIVE_OK) {
|
||||||
|
LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a));
|
||||||
|
archive_read_free(a);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
dyn_buffer_t buf = dyn_buffer_create();
|
dyn_buffer_t buf = dyn_buffer_create();
|
||||||
|
|
||||||
opcPart part = opcPartGetFirst(c);
|
struct archive_entry *entry;
|
||||||
do {
|
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||||
if (should_read_part(part)) {
|
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
|
||||||
int ret = read_part(c, &buf, part, doc);
|
char *path = (char *) archive_entry_pathname(entry);
|
||||||
|
|
||||||
|
if (should_read_part(path)) {
|
||||||
|
ret = read_part(a, &buf, doc);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while ((part = opcPartGetNext(c, part)));
|
|
||||||
|
|
||||||
opcContainerClose(c, OPC_CLOSE_NOW);
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// close
|
||||||
|
|
||||||
if (buf.cur > 0) {
|
if (buf.cur > 0) {
|
||||||
dyn_buffer_write_char(&buf, '\0');
|
dyn_buffer_write_char(&buf, '\0');
|
||||||
|
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user