Replace docx/pdf/spreadsheet parsers with Tika

2025-12-08 12:54:30 +00:00 · 2019-06-13 15:30:50 -04:00 · 2019-06-13 15:30:50 -04:00 · 25ab9dd9c7
commit 25ab9dd9c7
parent 980babc5cc
9 changed files with 67 additions and 220 deletions
--- a/common.py
+++ b/common.py
@ -0,0 +1,10 @@
+import os
+
+if not os.path.exists("tika"):
+    os.mkdir("tika")
+os.putenv("TIKA_PATH", os.path.join(__name__, "tika/"))
+os.putenv("TIKA_LOG_PATH", os.path.join(__name__, "tika/"))
+
+from tika import parser as tika, config
+
+config.getMimeTypes()
--- a/config.py
+++ b/config.py
@ -3,14 +3,11 @@ default_options = {
    "ThumbnailQuality": "85",
    "ThumbnailSize": "272",
    "ThumbnailColor": "FF00FF",
-    "TextFileContentLength": "2000",
-    "PdfFileContentLength": "2000",
-    "DocxContentLength": "2000",
-    "SpreadSheetContentLength": "2000",
-    "EbookContentLength": "2000",
+    "ContentLength": "4096",
+    "TextFileContentLength": "4096",
    "MimeGuesser": "extension",  # extension, content
    "CheckSumCalculators": "",  # md5, sha1, sha256
-    "FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook"
+    "FileParsers": "media, text, picture, font, tika"
 }

 # Index documents after every X parsed files (Larger number will use more memory)
--- a/crawler.py
+++ b/crawler.py
@ -11,7 +11,7 @@ import config
 from indexer import Indexer
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
-    PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
+    TikaFileParser
 from search import Search
 from storage import Directory
 from storage import Task, LocalStorage
@ -226,16 +226,8 @@ class TaskManager:
            parsers.append(PictureFileParser(chksum_calcs, directory.path))
        if "font" in p:
            parsers.append(FontParser(chksum_calcs, directory.path))
-        if "pdf" in p:
-            parsers.append(
-                PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
-        if "docx" in p:
-            parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
-        if "spreadsheet" in p:
-            parsers.append(
-                SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
-        if "ebook" in p:
-            parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
+        if "tika" in p:
+            parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
        return parsers

    def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
--- a/parsing.py
+++ b/parsing.py
@ -1,25 +1,14 @@
 import hashlib
-import os
-import mimetypes
-import subprocess
 import json
-import chardet
+import mimetypes
+import os
+import subprocess
 import warnings
-import docx2txt
-import xlrd
-from pdfminer.pdfparser import PDFParser, PDFSyntaxError
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.layout import LAParams, LTTextBox, LTTextLine
-from pdfminer.converter import PDFPageAggregator
-import html2text
-from ebooklib import epub
-import ebooklib
+
+import chardet
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
-import six
-from six.moves import xrange
+from common import tika


 class MimeGuesser:
@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
    def __init__(self, checksum_calculators: list, root_dir: str):
        self.checksum_calculators = checksum_calculators
        self.root_dir = root_dir
-        self.root_dir_len = len(root_dir)+1
+        self.root_dir_len = len(root_dir) + 1

    def parse(self, full_path: str) -> dict:
        """
@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
        return info


-class PdfFileParser(GenericFileParser):
+class TikaFileParser(GenericFileParser):
+    mime_types = [
+        "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/epub+zip",
+        "application/pdf", "application/x-pdf",
+    ]
    is_default = False

-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+    def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
        super().__init__(checksum_calculators, root_dir)
+        self.content_len = content_len

-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/pdf", "application/x-pdf"
-        ]
-
-    def parse(self, full_path: str):
+    def parse(self, full_path: str) -> dict:
+        """
+        Parse a generic file
+        :param full_path: path of the file to parse
+        :return: dict information about the file
+        """
        info = super().parse(full_path)

-        if self.content_length > 0:
-            with open(full_path, "rb") as f:
-
-                try:
-                    parser = PDFParser(f)
-                    document = PDFDocument(parser)
-                except PDFSyntaxError:
-                    print("couldn't parse PDF " + full_path)
-                    return info
-
-                info["content"] = ""
-                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
-                    if isinstance(document.info[0]["Title"], bytes):
-                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
-                    else:
-                        info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
-
-                try:
-                    if document.is_extractable:
-                        resource_manager = PDFResourceManager()
-                        la_params = LAParams()
-
-                        device = PDFPageAggregator(resource_manager, laparams=la_params)
-                        interpreter = PDFPageInterpreter(resource_manager, device)
-
-                        for page in PDFPage.create_pages(document):
-
-                            interpreter.process_page(page)
-                            layout = device.get_result()
-
-                            for lt_obj in layout:
-                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
-
-                                    text = lt_obj.get_text()
-
-                                    if len(info["content"]) + len(text) <= self.content_length:
-                                        info["content"] += text
-                                    else:
-                                        info["content"] += text[0:self.content_length - len(info["content"])]
-                                        break
-                            else:
-                                continue
-                            break
-                    else:
-                        print("PDF is not extractable: " + full_path)
-                except ValueError:
-                    print("Couldn't parse page for " + full_path)
-
-        return info
-
-
-class EbookParser(GenericFileParser):
-    is_default = False
-
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
-        super().__init__(checksum_calculators, root_dir)
-
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/epub+zip"
-        ]
-
-        self.html2text = html2text.HTML2Text()
-        self.html2text.ignore_images = True
-        self.html2text.ignore_emphasis = True
-
-    def parse(self, full_path: str):
-        info = super().parse(full_path)
-
-        book = epub.read_epub(full_path)
-
-        info["content"] = ""
-
-        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
-
-            text = self.html2text.handle(text.content.decode("utf-8"))
-
-            if len(info["content"]) + len(text) <= self.content_length:
-                info["content"] += text
-            else:
-                info["content"] += text[0:self.content_length - len(info["content"])]
-                break
-
-        return info
-
-
-class DocxParser(GenericFileParser):
-    is_default = False
-
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
-        super().__init__(checksum_calculators, root_dir)
-
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ]
-
-    def parse(self, full_path: str):
-        info = super().parse(full_path)
-
-        if self.content_length > 0:
-            try:
-                text = docx2txt.process(full_path)
-
-                if len(text) < self.content_length:
-                    info["content"] = text
-                else:
-                    info["content"] = text[0:self.content_length]
-            except:
-                print("Couldn't parse Ebook: " + full_path)
-
-        return info
-
-
-class SpreadSheetParser(GenericFileParser):
-    is_default = False
-
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
-        super().__init__(checksum_calculators, root_dir)
-
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        ]
-
-    def parse(self, full_path: str):
-        info = super().parse(full_path)
-
-        # The MIT License (MIT)
-        # Copyright (c) 2014 Dean Malmgren
-        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
-
-        try:
-            workbook = xlrd.open_workbook(full_path)
-
-            sheets_name = workbook.sheet_names()
-            info["content"] = ""
-
-            for names in sheets_name:
-                worksheet = workbook.sheet_by_name(names)
-                num_rows = worksheet.nrows
-                num_cells = worksheet.ncols
-
-                for curr_row in range(num_rows):
-                    new_output = []
-                    for index_col in xrange(num_cells):
-                        value = worksheet.cell_value(curr_row, index_col)
-                        if value:
-                            if isinstance(value, (int, float)):
-                                value = six.text_type(value)
-                            new_output.append(value)
-
-                    if new_output:
-                        text = u' '.join(new_output) + u'\n'
-                        if len(info["content"]) + len(text) <= self.content_length:
-                            info["content"] += text
-                        else:
-                            info["content"] += text[0:self.content_length - len(info["content"])]
-                            break
-
+        if info["size"] == 0:
            return info

-        except xlrd.biffh.XLRDError:
-            print("Couldn't parse spreadsheet: " + full_path)
+        tika_res = tika.from_file(full_path)
+        if "metadata" not in tika_res:
+            return info
+        tika_meta = tika_res["metadata"]
+        tika_content = tika_res["content"]

+        if isinstance(tika_meta["Content-Type"], list):
+            info["mime"] = tika_meta["Content-Type"][0]
+        else:
+            info["mime"] = tika_meta["Content-Type"]
+
+        if tika_content:
+            info["content"] = tika_content.lstrip()[:self.content_len]
+
+        if "Content-Encoding" in tika_meta:
+            info["encoding"] = tika_meta["Content-Encoding"]
+
+        return info
--- a/requirements.txt
+++ b/requirements.txt
@ -17,4 +17,5 @@ docx2txt
 xlrd
 six
 cairosvg
-ffmpeg-python
+ffmpeg-python
+tika
--- a/test/test_DocxParser.py
+++ b/test/test_DocxParser.py
@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import DocxParser
+from parsing import TikaFileParser
 import os

 dir_name = os.path.dirname(os.path.abspath(__file__))
@ -9,7 +9,7 @@ class DocxParserTest(TestCase):

    def test_parse_content(self):

-        parser = DocxParser([], 1000, dir_name + "/test_files/")
+        parser = TikaFileParser([], dir_name + "/test_files/", 1000)

        info = parser.parse(dir_name + "/test_files/docx1.docx")

--- a/test/test_EbookParser.py
+++ b/test/test_EbookParser.py
@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import EbookParser
+from parsing import TikaFileParser
 import os

 dir_name = os.path.dirname(os.path.abspath(__file__))
@ -9,7 +9,7 @@ class EbookParserTest(TestCase):

    def test_parse_content(self):

-        parser = EbookParser([], 1000, dir_name + "/test_files/")
+        parser = TikaFileParser([], dir_name + "/test_files/", 1000)

        info = parser.parse(dir_name + "/test_files/epub1.epub")

--- a/test/test_PdfFileParser.py
+++ b/test/test_PdfFileParser.py
@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import PdfFileParser
+from parsing import TikaFileParser
 import os

 dir_name = os.path.dirname(os.path.abspath(__file__))
@ -9,9 +9,8 @@ class PdfParserTest(TestCase):

    def test_parse_content(self):

-        parser = PdfFileParser([], 12488, "test_files/")
+        parser = TikaFileParser([], "test_files/", 12488)

        info = parser.parse(dir_name + "/test_files/pdf1.pdf")

        self.assertEqual(len(info["content"]), 12488)
-        self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
--- a/test/test_SpreadSheetParser.py
+++ b/test/test_SpreadSheetParser.py
@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import SpreadSheetParser
+from parsing import TikaFileParser

 import os

@ -10,7 +10,7 @@ class PdfParserTest(TestCase):

    def test_parse_content_xls(self):

-        parser = SpreadSheetParser([], 1500, "test_files/")
+        parser = TikaFileParser([], "test_files/", 1500)

        info = parser.parse(dir_name + "/test_files/xls1.xls")

@ -18,7 +18,7 @@ class PdfParserTest(TestCase):

    def test_parse_content_xlsx(self):

-        parser = SpreadSheetParser([], 1500, "test_files/")
+        parser = TikaFileParser([], "test_files/", 1500)

        info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")