mirror of
https://github.com/simon987/sist2.git
synced 2025-04-10 14:06:45 +00:00
wip doc rewrite
This commit is contained in:
parent
641edf2715
commit
d7cbd5d2b6
@ -81,14 +81,20 @@ target_link_directories(
|
||||
${UUID_LIBRARY_DIRS}
|
||||
)
|
||||
|
||||
target_compile_options(sist2
|
||||
target_compile_options(
|
||||
sist2
|
||||
PRIVATE
|
||||
-Ofast
|
||||
# -march=native
|
||||
|
||||
-fPIC
|
||||
-fno-stack-protector
|
||||
-fomit-frame-pointer
|
||||
)
|
||||
|
||||
-Ofast
|
||||
# -fno-stack-protector
|
||||
# -fomit-frame-pointer
|
||||
|
||||
-g
|
||||
-march=native
|
||||
-fstack-protector
|
||||
)
|
||||
|
||||
TARGET_LINK_LIBRARIES(
|
||||
sist2
|
||||
|
@ -1,8 +1,6 @@
|
||||
#include "cli.h"
|
||||
#include "ctx.h"
|
||||
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#define DEFAULT_OUTPUT "index.sist2/"
|
||||
#define DEFAULT_CONTENT_SIZE 32768
|
||||
#define DEFAULT_QUALITY 5
|
||||
|
@ -1,12 +1,6 @@
|
||||
#include "elastic.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "web.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <cJSON/cJSON.h>
|
||||
|
||||
#include "static_generated.c"
|
||||
|
||||
|
||||
|
@ -15,8 +15,7 @@ store_t *store_create(char *path) {
|
||||
);
|
||||
|
||||
if (open_ret != 0) {
|
||||
fprintf(stderr, "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path);
|
||||
exit(1);
|
||||
LOG_FATALF("store.c", "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path)
|
||||
}
|
||||
|
||||
store->size = (size_t) 1024 * 1024 * 5;
|
||||
@ -42,6 +41,9 @@ void store_destroy(store_t *store) {
|
||||
|
||||
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
|
||||
|
||||
// TODO: DEBUG
|
||||
return;
|
||||
|
||||
if (LogCtx.very_verbose) {
|
||||
char uuid_str[UUID_STR_LEN];
|
||||
uuid_unparse((unsigned char *) key, uuid_str);
|
||||
@ -82,7 +84,7 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
|
||||
pthread_rwlock_unlock(&store->lock);
|
||||
|
||||
if (put_ret != 0) {
|
||||
printf("%s\n", mdb_strerror(put_ret));
|
||||
LOG_ERROR("store.c", mdb_strerror(put_ret))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
#import "cbr.h"
|
||||
#import "src/ctx.h"
|
||||
#include "cbr.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
unsigned int cbr_mime;
|
||||
unsigned int cbz_mime;
|
||||
|
@ -1,46 +1,10 @@
|
||||
#include "doc.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
int dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {
|
||||
|
||||
mce_skip_attributes(reader);
|
||||
|
||||
xmlErrorPtr err = xmlGetLastError();
|
||||
if (err != NULL) {
|
||||
if (err->level == XML_ERR_FATAL) {
|
||||
LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
|
||||
return -1;
|
||||
} else {
|
||||
LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
|
||||
}
|
||||
}
|
||||
|
||||
mce_start_children(reader) {
|
||||
mce_start_element(reader, NULL, _X("t")) {
|
||||
mce_skip_attributes(reader);
|
||||
mce_start_children(reader) {
|
||||
mce_start_text(reader) {
|
||||
char *str = (char *) xmlTextReaderConstValue(reader->reader);
|
||||
dyn_buffer_append_string(buf, str);
|
||||
dyn_buffer_write_char(buf, ' ');
|
||||
} mce_end_text(reader);
|
||||
} mce_end_children(reader);
|
||||
} mce_end_element(reader);
|
||||
|
||||
mce_start_element(reader, NULL, NULL) {
|
||||
int ret = dump_text(reader, buf);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
} mce_end_element(reader);
|
||||
|
||||
} mce_end_children(reader)
|
||||
return 0;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
int should_read_part(opcPart part) {
|
||||
static int should_read_part(char *part) {
|
||||
|
||||
LOG_DEBUGF("doc.c", "Got part : %s", part)
|
||||
char *part_name = (char *) part;
|
||||
|
||||
if (part == NULL) {
|
||||
@ -65,29 +29,78 @@ int should_read_part(opcPart part) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
typedef int (XMLCALL *xmlInputReadCallback)(void *context, char *buffer, int len);
|
||||
|
||||
typedef struct {
|
||||
struct archive *a;
|
||||
} xml_io_ctx;
|
||||
|
||||
int xml_io_read(void *context, char *buffer, int len) {
|
||||
xml_io_ctx *ctx = context;
|
||||
|
||||
//TODO: return value ?
|
||||
return archive_read_data(ctx->a, buffer, len);
|
||||
}
|
||||
|
||||
int xml_io_close(void *context) {
|
||||
//noop
|
||||
return 0;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
int read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part, document_t *doc) {
|
||||
static int read_part(struct archive *a, dyn_buffer_t *buf, document_t *doc) {
|
||||
|
||||
mceTextReader_t reader;
|
||||
int ret = opcXmlReaderOpen(c, &reader, part, NULL, "UTF-8", XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
xmlNode *root, *first_child, *node1, *node2, *node3, *node4;
|
||||
|
||||
if (ret != OPC_ERROR_NONE) {
|
||||
LOG_ERRORF(doc->filepath, "(doc.c) opcXmlReaderOpen() returned error code %d", ret);
|
||||
xml_io_ctx ctx = {a};
|
||||
|
||||
/* do actual parsing of document */
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, &ctx, "/", NULL, 0);
|
||||
|
||||
/* error checking! */
|
||||
if (xml == NULL) {
|
||||
fprintf(stderr, "Document not parsed successfully. \n");
|
||||
return -1;
|
||||
}
|
||||
root = xmlDocGetRootElement(xml);
|
||||
if (root == NULL) {
|
||||
fprintf(stderr, "empty document\n");
|
||||
xmlFreeDoc(xml);
|
||||
return -1;
|
||||
}
|
||||
if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) {
|
||||
fprintf(stderr, "document of the wrong type, root node != document");
|
||||
xmlFreeDoc(xml);
|
||||
return -1;
|
||||
}
|
||||
|
||||
mce_start_document(&reader) {
|
||||
mce_start_element(&reader, NULL, NULL) {
|
||||
ret = dump_text(&reader, buf);
|
||||
if (ret != 0) {
|
||||
mceTextReaderCleanup(&reader);
|
||||
return -1;
|
||||
}
|
||||
} mce_end_element(&reader);
|
||||
} mce_end_document(&reader);
|
||||
/* init a few more variables */
|
||||
xmlChar *key;
|
||||
|
||||
mceTextReaderCleanup(&reader);
|
||||
return 0;
|
||||
first_child = root->children;
|
||||
for (node1 = first_child; node1; node1 = node1->next) {
|
||||
if ((xmlStrcmp(node1->name, (const xmlChar *) "body")) == 0) {
|
||||
for (node2 = node1->children; node2; node2 = node2->next) {
|
||||
if ((xmlStrcmp(node2->name, (const xmlChar *) "p")) == 0) {
|
||||
|
||||
dyn_buffer_write_char(buf, ' ');
|
||||
|
||||
for (node3 = node2->children; node3; node3 = node3->next) {
|
||||
if ((xmlStrcmp(node3->name, (const xmlChar *) "r")) == 0) {
|
||||
for (node4 = node3->children; node4; node4 = node4->next) {
|
||||
if ((!xmlStrcmp(node4->name, (const xmlChar *) "t"))) {
|
||||
key = xmlNodeListGetString(xml, node4->xmlChildrenNode, 1);
|
||||
|
||||
dyn_buffer_append_string(buf, (char *) key);
|
||||
dyn_buffer_write_char(buf, ' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
||||
@ -96,25 +109,35 @@ void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
||||
return;
|
||||
}
|
||||
|
||||
opcContainer *c = opcContainerOpenMem(mem, mem_len, OPC_OPEN_READ_ONLY, NULL);
|
||||
if (c == NULL) {
|
||||
LOG_ERROR(doc->filepath, "(doc.c) Couldn't open document with opcContainerOpenMem()");
|
||||
struct archive *a = archive_read_new();
|
||||
archive_read_support_format_zip(a);
|
||||
|
||||
int ret = archive_read_open_memory(a, mem, mem_len);
|
||||
if (ret != ARCHIVE_OK) {
|
||||
LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a));
|
||||
archive_read_free(a);
|
||||
return;
|
||||
}
|
||||
|
||||
dyn_buffer_t buf = dyn_buffer_create();
|
||||
|
||||
opcPart part = opcPartGetFirst(c);
|
||||
do {
|
||||
if (should_read_part(part)) {
|
||||
int ret = read_part(c, &buf, part, doc);
|
||||
if (ret != 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while ((part = opcPartGetNext(c, part)));
|
||||
struct archive_entry *entry;
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
|
||||
char *path = (char *) archive_entry_pathname(entry);
|
||||
|
||||
opcContainerClose(c, OPC_CLOSE_NOW);
|
||||
if (should_read_part(path)) {
|
||||
ret = read_part(a, &buf, doc);
|
||||
if (ret != 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// close
|
||||
|
||||
if (buf.cur > 0) {
|
||||
dyn_buffer_write_char(&buf, '\0');
|
||||
|
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user