wip doc rewrite

This commit is contained in:
simon987 2020-03-05 09:13:37 -05:00
parent 641edf2715
commit d7cbd5d2b6
7 changed files with 113 additions and 90 deletions

View File

@ -81,13 +81,19 @@ target_link_directories(
${UUID_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS}
) )
target_compile_options(sist2 target_compile_options(
sist2
PRIVATE PRIVATE
-Ofast
# -march=native
-fPIC -fPIC
-fno-stack-protector
-fomit-frame-pointer -Ofast
# -fno-stack-protector
# -fomit-frame-pointer
-g
-march=native
-fstack-protector
) )
TARGET_LINK_LIBRARIES( TARGET_LINK_LIBRARIES(

View File

@ -1,8 +1,6 @@
#include "cli.h" #include "cli.h"
#include "ctx.h" #include "ctx.h"
#include <tesseract/capi.h>
#define DEFAULT_OUTPUT "index.sist2/" #define DEFAULT_OUTPUT "index.sist2/"
#define DEFAULT_CONTENT_SIZE 32768 #define DEFAULT_CONTENT_SIZE 32768
#define DEFAULT_QUALITY 5 #define DEFAULT_QUALITY 5

View File

@ -1,12 +1,6 @@
#include "elastic.h" #include "elastic.h"
#include "src/ctx.h" #include "src/ctx.h"
#include <stdlib.h>
#include "web.h"
#include <stdio.h>
#include <string.h>
#include <cJSON/cJSON.h>
#include "static_generated.c" #include "static_generated.c"

View File

@ -15,8 +15,7 @@ store_t *store_create(char *path) {
); );
if (open_ret != 0) { if (open_ret != 0) {
fprintf(stderr, "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path); LOG_FATALF("store.c", "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path)
exit(1);
} }
store->size = (size_t) 1024 * 1024 * 5; store->size = (size_t) 1024 * 1024 * 5;
@ -42,6 +41,9 @@ void store_destroy(store_t *store) {
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) { void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
// TODO: DEBUG
return;
if (LogCtx.very_verbose) { if (LogCtx.very_verbose) {
char uuid_str[UUID_STR_LEN]; char uuid_str[UUID_STR_LEN];
uuid_unparse((unsigned char *) key, uuid_str); uuid_unparse((unsigned char *) key, uuid_str);
@ -82,7 +84,7 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
pthread_rwlock_unlock(&store->lock); pthread_rwlock_unlock(&store->lock);
if (put_ret != 0) { if (put_ret != 0) {
printf("%s\n", mdb_strerror(put_ret)); LOG_ERROR("store.c", mdb_strerror(put_ret))
} }
} }

View File

@ -1,5 +1,5 @@
#import "cbr.h" #include "cbr.h"
#import "src/ctx.h" #include "src/ctx.h"
unsigned int cbr_mime; unsigned int cbr_mime;
unsigned int cbz_mime; unsigned int cbz_mime;

View File

@ -1,46 +1,10 @@
#include "doc.h" #include "doc.h"
#include "src/ctx.h" #include "src/ctx.h"
int dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {
mce_skip_attributes(reader);
xmlErrorPtr err = xmlGetLastError();
if (err != NULL) {
if (err->level == XML_ERR_FATAL) {
LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
return -1;
} else {
LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
}
}
mce_start_children(reader) {
mce_start_element(reader, NULL, _X("t")) {
mce_skip_attributes(reader);
mce_start_children(reader) {
mce_start_text(reader) {
char *str = (char *) xmlTextReaderConstValue(reader->reader);
dyn_buffer_append_string(buf, str);
dyn_buffer_write_char(buf, ' ');
} mce_end_text(reader);
} mce_end_children(reader);
} mce_end_element(reader);
mce_start_element(reader, NULL, NULL) {
int ret = dump_text(reader, buf);
if (ret != 0) {
return ret;
}
} mce_end_element(reader);
} mce_end_children(reader)
return 0;
}
__always_inline __always_inline
int should_read_part(opcPart part) { static int should_read_part(char *part) {
LOG_DEBUGF("doc.c", "Got part : %s", part)
char *part_name = (char *) part; char *part_name = (char *) part;
if (part == NULL) { if (part == NULL) {
@ -65,56 +29,115 @@ int should_read_part(opcPart part) {
return FALSE; return FALSE;
} }
__always_inline typedef int (XMLCALL *xmlInputReadCallback)(void *context, char *buffer, int len);
int read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part, document_t *doc) {
mceTextReader_t reader; typedef struct {
int ret = opcXmlReaderOpen(c, &reader, part, NULL, "UTF-8", XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET); struct archive *a;
} xml_io_ctx;
if (ret != OPC_ERROR_NONE) { int xml_io_read(void *context, char *buffer, int len) {
LOG_ERRORF(doc->filepath, "(doc.c) opcXmlReaderOpen() returned error code %d", ret); xml_io_ctx *ctx = context;
return -1;
//TODO: return value ?
return archive_read_data(ctx->a, buffer, len);
} }
mce_start_document(&reader) { int xml_io_close(void *context) {
mce_start_element(&reader, NULL, NULL) { //noop
ret = dump_text(&reader, buf);
if (ret != 0) {
mceTextReaderCleanup(&reader);
return -1;
}
} mce_end_element(&reader);
} mce_end_document(&reader);
mceTextReaderCleanup(&reader);
return 0; return 0;
} }
/**
 * Parse the archive entry currently positioned in `a` as XML and append the
 * text found under document > body > p > r > t elements to `buf`.
 *
 * A single space is written before the runs of every "p" element and after
 * every "t" text, so consecutive fragments stay separated in the output.
 *
 * @param a   libarchive handle; its current entry's data is streamed to the
 *            XML parser through xml_io_read().
 * @param buf Destination buffer that receives the extracted text.
 * @param doc Document being parsed (currently unused by this function).
 * @return 0 on success, -1 if the XML could not be parsed, has no root
 *         element, or its root element is not named "document".
 */
__always_inline
static int read_part(struct archive *a, dyn_buffer_t *buf, document_t *doc) {
    xml_io_ctx ctx = {a};

    /* NOTE(review): parser options are 0 here; the previous implementation
     * passed XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET.
     * Confirm whether those flags should be restored. */
    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, &ctx, "/", NULL, 0);
    if (xml == NULL) {
        fprintf(stderr, "Document not parsed successfully. \n");
        return -1;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        fprintf(stderr, "empty document\n");
        xmlFreeDoc(xml);
        return -1;
    }

    if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) {
        fprintf(stderr, "document of the wrong type, root node != document");
        xmlFreeDoc(xml);
        return -1;
    }

    for (xmlNode *body = root->children; body; body = body->next) {
        if (xmlStrcmp(body->name, (const xmlChar *) "body") != 0) {
            continue;
        }

        for (xmlNode *para = body->children; para; para = para->next) {
            if (xmlStrcmp(para->name, (const xmlChar *) "p") != 0) {
                continue;
            }

            dyn_buffer_write_char(buf, ' ');

            for (xmlNode *run = para->children; run; run = run->next) {
                if (xmlStrcmp(run->name, (const xmlChar *) "r") != 0) {
                    continue;
                }

                for (xmlNode *text = run->children; text; text = text->next) {
                    if (xmlStrcmp(text->name, (const xmlChar *) "t") != 0) {
                        continue;
                    }

                    /* xmlNodeListGetString() returns a newly allocated copy
                     * (or NULL when the node has no text); it must be
                     * released with xmlFree() once appended. */
                    xmlChar *content = xmlNodeListGetString(xml, text->xmlChildrenNode, 1);
                    if (content != NULL) {
                        dyn_buffer_append_string(buf, (char *) content);
                        xmlFree(content);
                    }
                    dyn_buffer_write_char(buf, ' ');
                }
            }
        }
    }

    /* Release the parsed tree and report success explicitly: the caller
     * checks the return value against 0. */
    xmlFreeDoc(xml);
    return 0;
}
void parse_doc(void *mem, size_t mem_len, document_t *doc) { void parse_doc(void *mem, size_t mem_len, document_t *doc) {
if (mem == NULL) { if (mem == NULL) {
return; return;
} }
opcContainer *c = opcContainerOpenMem(mem, mem_len, OPC_OPEN_READ_ONLY, NULL); struct archive *a = archive_read_new();
if (c == NULL) { archive_read_support_format_zip(a);
LOG_ERROR(doc->filepath, "(doc.c) Couldn't open document with opcContainerOpenMem()");
int ret = archive_read_open_memory(a, mem, mem_len);
if (ret != ARCHIVE_OK) {
LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a));
archive_read_free(a);
return; return;
} }
dyn_buffer_t buf = dyn_buffer_create(); dyn_buffer_t buf = dyn_buffer_create();
opcPart part = opcPartGetFirst(c); struct archive_entry *entry;
do { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (should_read_part(part)) { if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
int ret = read_part(c, &buf, part, doc); char *path = (char *) archive_entry_pathname(entry);
if (should_read_part(path)) {
ret = read_part(a, &buf, doc);
if (ret != 0) { if (ret != 0) {
break; break;
} }
} }
} while ((part = opcPartGetNext(c, part)));
opcContainerClose(c, OPC_CLOSE_NOW); }
}
// close
if (buf.cur > 0) { if (buf.cur > 0) {
dyn_buffer_write_char(&buf, '\0'); dyn_buffer_write_char(&buf, '\0');

File diff suppressed because one or more lines are too long