diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ef7dd8..689837a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,7 @@ find_package(Tesseract CONFIG REQUIRED)
 find_package(harfbuzz CONFIG REQUIRED)
 find_package(OpenJPEG CONFIG REQUIRED)
 find_package(JPEG REQUIRED)
+find_package(LibXml2 REQUIRED)
 
 
 include(ExternalProject)
@@ -41,6 +42,7 @@ add_library(
         libscan/arc/arc.c libscan/arc/arc.h
         libscan/ebook/ebook.c libscan/ebook/ebook.h
         libscan/cbr/cbr.c libscan/cbr/cbr.h
+        libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
 
         third-party/utf8.h
 )
@@ -76,6 +78,7 @@ target_link_libraries(
         ${JPEG_LIBRARIES}
         ${LibArchive_LIBRARIES}
         ${Tesseract_LIBRARIES}
+        ${LIBXML2_LIBRARIES}
 
         ${CMAKE_THREAD_LIBS_INIT}
 
@@ -88,6 +91,7 @@ target_include_directories(
         PRIVATE
         "${CMAKE_SOURCE_DIR}/third-party/ext_mupdf/src/mupdf/include/"
         ${JPEG_INCLUDE_DIR}
+        ${LIBXML2_INCLUDE_DIR}
 )
 
 
diff --git a/README.md b/README.md
index 71ff6e5..4984801 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
 
 ```bash
-vcpkg install libarchive pthread tesseract
+vcpkg install libarchive pthread tesseract libxml2
 
-rm -rf CMakeFiles/ CMakeCache.txt
 cmake -DCMAKE_TOOLCHAIN_FILE=/usr/share/vcpkg/scripts/buildsystems/vcpkg.cmake .
 make -j 4
 ```
\ No newline at end of file
diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c
new file mode 100644
index 0000000..11beb19
--- /dev/null
+++ b/libscan/ooxml/ooxml.c
@@ -0,0 +1,142 @@
+#include "ooxml.h"
+
+#include "../util.h"
+#include <archive.h>
+#include <archive_entry.h>
+#include <libxml/xmlstring.h>
+#include <libxml/parser.h>
+
+/* Returns TRUE when the archive entry path `part` names an OOXML part whose
+ * text should be indexed, FALSE otherwise (including when part is NULL). */
+__always_inline
+static int should_read_part(const char *part) {
+//    LOG_DEBUGF("ooxml.c", "Got part : %s", part)
+    if (part == NULL) {
+        return FALSE;
+    }
+
+    if (    // Word
+            STR_STARTS_WITH(part, "word/document.xml")
+            || STR_STARTS_WITH(part, "word/footnotes.xml")
+            || STR_STARTS_WITH(part, "word/endnotes.xml")
+            || STR_STARTS_WITH(part, "word/footer")
+            || STR_STARTS_WITH(part, "word/header")
+            // PowerPoint (speaker-notes parts are named notesSlideN.xml, not slideN.xml)
+            || STR_STARTS_WITH(part, "ppt/slides/slide")
+            || STR_STARTS_WITH(part, "ppt/notesSlides/notesSlide")
+            // Excel
+            || STR_STARTS_WITH(part, "xl/worksheets/sheet")
+            || STR_STARTS_WITH(part, "xl/sharedStrings.xml")
+            || STR_STARTS_WITH(part, "xl/workbook.xml")
+            ) {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+/* Appends the text of every element named "t" found in `node`, its following
+ * siblings and all their descendants to `buf`, each followed by a space.
+ * Returns -1 without extracting if libxml2's last error is fatal, else 0. */
+int extract_text(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
+    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
+    xmlErrorPtr err = xmlGetLastError();
+    if (err != NULL) {
+        if (err->level == XML_ERR_FATAL) {
+//            LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
+            return -1;
+        } else {
+//            LOG_ERRORF("ooxml.c", "Got recoverable XML error while parsing document: %s", err->message)
+        }
+    }
+    for (xmlNode *child = node; child; child = child->next) {
+        if (xmlStrEqual(child->name, BAD_CAST "t")) { // NULL-safe: e.g. CDATA nodes have no name
+            xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
+            if (text) {
+                text_buffer_append_string0(buf, (char *) text);
+                text_buffer_append_char(buf, ' ');
+                xmlFree(text);
+            }
+        }
+        extract_text(xml, child->children, buf);
+    }
+    return 0;
+}
+
+// Read callback handed to xmlReadIO(): `context` is the struct archive being read
+int xml_io_read(void *context, char *buffer, int len) {
+    return archive_read_data((struct archive *) context, buffer, len);
+}
+
+int xml_io_close(UNUSED(void *context)) {
+    // Close callback handed to xmlReadIO(): nothing to do, parse_doc() closes the archive
+    return 0;
+}
+
+/* Parses the current archive entry of `a` as XML and appends its text to `buf`.
+ * Returns 0 on success, -1 if the part could not be parsed or has no root. */
+__always_inline
+static int read_part(struct archive *a, text_buffer_t *buf, document_t *doc) {
+    // extract_text() bails out on xmlGetLastError(); drop errors left by earlier parts/files
+    xmlResetLastError();
+    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
+    if (xml == NULL) {
+//        LOG_ERROR(doc->filepath, "Could not parse XML")
+        return -1;
+    }
+
+    xmlNode *root = xmlDocGetRootElement(xml);
+    if (root == NULL) {
+//        LOG_ERROR(doc->filepath, "Empty document")
+        xmlFreeDoc(xml);
+        return -1;
+    }
+    extract_text(xml, root, buf);
+    xmlFreeDoc(xml);
+    return 0;
+}
+
+/* Extracts the text of an OOXML (zip) document read from `f` and, if any was
+ * found, appends it to `doc` as a MetaContent meta line. Parts are read until
+ * one fails to parse; text collected before that point is still kept. */
+void parse_doc(scan_ooxml_cxt_t *ctx, vfile_t *f, document_t *doc) {
+    size_t buf_len;
+    void *buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        return;
+    }
+
+    struct archive *a = archive_read_new();
+    archive_read_support_format_zip(a);
+
+    int ret = archive_read_open_memory(a, buf, buf_len);
+    if (ret != ARCHIVE_OK) {
+//        LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
+        archive_read_free(a);
+        free(buf);
+        return;
+    }
+
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+    struct archive_entry *entry;
+    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
+        if (S_ISREG(archive_entry_stat(entry)->st_mode)
+            && should_read_part(archive_entry_pathname(entry))
+            && read_part(a, &tex, doc) != 0) {
+            break;
+        }
+    }
+
+    if (tex.dyn_buffer.cur > 0) {
+        text_buffer_terminate_string(&tex);
+        meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
+        meta->key = MetaContent;
+        strcpy(meta->str_val, tex.dyn_buffer.buf);
+        APPEND_META(doc, meta)
+    }
+
+    archive_read_close(a);
+    archive_read_free(a);
+    text_buffer_destroy(&tex);
+    free(buf); // the archive read from buf until now; assumes read_all() malloc'd it - TODO confirm
+}
diff --git a/libscan/ooxml/ooxml.h b/libscan/ooxml/ooxml.h
new file mode 100644
index 0000000..369288c
--- /dev/null
+++ b/libscan/ooxml/ooxml.h
@@ -0,0 +1,13 @@
+#ifndef SCAN_OOXML_H
+#define SCAN_OOXML_H
+
+#include <stdlib.h>
+#include "../scan.h"
+
+typedef struct {
+    long content_size; // passed to text_buffer_create() for the extracted-text buffer
+} scan_ooxml_cxt_t;
+// Extracts the text of the OOXML document read from f and appends it to doc as MetaContent
+void parse_doc(scan_ooxml_cxt_t *ctx, vfile_t *f, document_t *doc);
+
+#endif
diff --git a/libscan/scan.h b/libscan/scan.h
index 626ddb8..c0147b8 100644
--- a/libscan/scan.h
+++ b/libscan/scan.h
@@ -5,6 +5,8 @@
 #include <sys/stat.h>
 #include "../third-party/uuid/src/uuid.h"
 
+#include "macros.h"
+
 
 #define META_INT_MASK 0x80
 #define META_STR_MASK 0x40
diff --git a/libscan/util.h b/libscan/util.h
index 05c7a62..6115d46 100644
--- a/libscan/util.h
+++ b/libscan/util.h
@@ -7,6 +7,8 @@
 #include "../third-party/utf8.h/utf8.h"
 #include "macros.h"
 
+/* True when string x begins with y; y must be a string literal or char array (sizeof gives its length) */
+#define STR_STARTS_WITH(x, y) (strncmp((y), (x), sizeof(y) - 1) == 0)
 #define TEXT_BUF_FULL -1
 #define INITIAL_BUF_SIZE 1024 * 16
 
diff --git a/test/main.c b/test/main.c
index 7e9008d..3b31369 100644
--- a/test/main.c
+++ b/test/main.c
@@ -2,6 +2,7 @@
 #include <fcntl.h>
 #include "../libscan/arc/arc.h"
 #include "../libscan/ebook/ebook.h"
+#include "../libscan/ooxml/ooxml.h"
 
 int main() {