Replace docx/pdf/spreadsheet parsers with Tika

This commit is contained in:
simon 2019-06-13 15:30:50 -04:00
parent 980babc5cc
commit 25ab9dd9c7
9 changed files with 67 additions and 220 deletions

10
common.py Normal file
View File

@@ -0,0 +1,10 @@
import os

# Keep Apache Tika's runtime files (the server jar and its logs) in a local
# "tika/" directory next to this module instead of the system temp dir.
_TIKA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tika")
if not os.path.exists(_TIKA_DIR):
    os.mkdir(_TIKA_DIR)

# BUG FIX: os.putenv() only affects child processes and does NOT update
# os.environ, which is what the tika-python client reads its configuration
# from — assign through os.environ so the settings actually take effect.
# Also, the original joined against __name__ (a module-name string, not a
# path) while mkdir() created "tika" relative to the CWD; both now agree
# on the same absolute directory.
os.environ["TIKA_PATH"] = _TIKA_DIR
os.environ["TIKA_LOG_PATH"] = _TIKA_DIR

from tika import parser as tika, config

# Touch the server at import time so the first real parse call does not
# pay the Tika server startup cost.
config.getMimeTypes()

View File

@@ -3,14 +3,11 @@ default_options = {
"ThumbnailQuality": "85", "ThumbnailQuality": "85",
"ThumbnailSize": "272", "ThumbnailSize": "272",
"ThumbnailColor": "FF00FF", "ThumbnailColor": "FF00FF",
"TextFileContentLength": "2000", "ContentLength": "4096",
"PdfFileContentLength": "2000", "TextFileContentLength": "4096",
"DocxContentLength": "2000",
"SpreadSheetContentLength": "2000",
"EbookContentLength": "2000",
"MimeGuesser": "extension", # extension, content "MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256 "CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook" "FileParsers": "media, text, picture, font, tika"
} }
# Index documents after every X parsed files (Larger number will use more memory) # Index documents after every X parsed files (Larger number will use more memory)

View File

@@ -11,7 +11,7 @@ import config
from indexer import Indexer from indexer import Indexer
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser TikaFileParser
from search import Search from search import Search
from storage import Directory from storage import Directory
from storage import Task, LocalStorage from storage import Task, LocalStorage
@@ -226,16 +226,8 @@ class TaskManager:
parsers.append(PictureFileParser(chksum_calcs, directory.path)) parsers.append(PictureFileParser(chksum_calcs, directory.path))
if "font" in p: if "font" in p:
parsers.append(FontParser(chksum_calcs, directory.path)) parsers.append(FontParser(chksum_calcs, directory.path))
if "pdf" in p: if "tika" in p:
parsers.append( parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
if "docx" in p:
parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
if "spreadsheet" in p:
parsers.append(
SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
if "ebook" in p:
parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
return parsers return parsers
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value): def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):

View File

@@ -1,25 +1,14 @@
import hashlib import hashlib
import os
import mimetypes
import subprocess
import json import json
import chardet import mimetypes
import os
import subprocess
import warnings import warnings
import docx2txt
import xlrd import chardet
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import html2text
from ebooklib import epub
import ebooklib
from PIL import Image from PIL import Image
from fontTools.ttLib import TTFont, TTLibError from fontTools.ttLib import TTFont, TTLibError
import six from common import tika
from six.moves import xrange
class MimeGuesser: class MimeGuesser:
@@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
def __init__(self, checksum_calculators: list, root_dir: str): def __init__(self, checksum_calculators: list, root_dir: str):
self.checksum_calculators = checksum_calculators self.checksum_calculators = checksum_calculators
self.root_dir = root_dir self.root_dir = root_dir
self.root_dir_len = len(root_dir)+1 self.root_dir_len = len(root_dir) + 1
def parse(self, full_path: str) -> dict: def parse(self, full_path: str) -> dict:
""" """
@@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
return info return info
class PdfFileParser(GenericFileParser): class TikaFileParser(GenericFileParser):
mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/epub+zip",
"application/pdf", "application/x-pdf",
]
is_default = False is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir): def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
super().__init__(checksum_calculators, root_dir) super().__init__(checksum_calculators, root_dir)
self.content_len = content_len
self.content_length = content_length def parse(self, full_path: str) -> dict:
"""
self.mime_types = [ Parse a generic file
"application/pdf", "application/x-pdf" :param full_path: path of the file to parse
] :return: dict information about the file
"""
def parse(self, full_path: str):
info = super().parse(full_path) info = super().parse(full_path)
if self.content_length > 0: if info["size"] == 0:
with open(full_path, "rb") as f:
try:
parser = PDFParser(f)
document = PDFDocument(parser)
except PDFSyntaxError:
print("couldn't parse PDF " + full_path)
return info
info["content"] = ""
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
if isinstance(document.info[0]["Title"], bytes):
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
else:
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
try:
if document.is_extractable:
resource_manager = PDFResourceManager()
la_params = LAParams()
device = PDFPageAggregator(resource_manager, laparams=la_params)
interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
text = lt_obj.get_text()
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
else:
continue
break
else:
print("PDF is not extractable: " + full_path)
except ValueError:
print("Couldn't parse page for " + full_path)
return info
class EbookParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/epub+zip"
]
self.html2text = html2text.HTML2Text()
self.html2text.ignore_images = True
self.html2text.ignore_emphasis = True
def parse(self, full_path: str):
info = super().parse(full_path)
book = epub.read_epub(full_path)
info["content"] = ""
for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
text = self.html2text.handle(text.content.decode("utf-8"))
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
return info
class DocxParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]
def parse(self, full_path: str):
info = super().parse(full_path)
if self.content_length > 0:
try:
text = docx2txt.process(full_path)
if len(text) < self.content_length:
info["content"] = text
else:
info["content"] = text[0:self.content_length]
except:
print("Couldn't parse Ebook: " + full_path)
return info
class SpreadSheetParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
def parse(self, full_path: str):
info = super().parse(full_path)
# The MIT License (MIT)
# Copyright (c) 2014 Dean Malmgren
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
try:
workbook = xlrd.open_workbook(full_path)
sheets_name = workbook.sheet_names()
info["content"] = ""
for names in sheets_name:
worksheet = workbook.sheet_by_name(names)
num_rows = worksheet.nrows
num_cells = worksheet.ncols
for curr_row in range(num_rows):
new_output = []
for index_col in xrange(num_cells):
value = worksheet.cell_value(curr_row, index_col)
if value:
if isinstance(value, (int, float)):
value = six.text_type(value)
new_output.append(value)
if new_output:
text = u' '.join(new_output) + u'\n'
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
return info return info
except xlrd.biffh.XLRDError: tika_res = tika.from_file(full_path)
print("Couldn't parse spreadsheet: " + full_path) if "metadata" not in tika_res:
return info
tika_meta = tika_res["metadata"]
tika_content = tika_res["content"]
if isinstance(tika_meta["Content-Type"], list):
info["mime"] = tika_meta["Content-Type"][0]
else:
info["mime"] = tika_meta["Content-Type"]
if tika_content:
info["content"] = tika_content.lstrip()[:self.content_len]
if "Content-Encoding" in tika_meta:
info["encoding"] = tika_meta["Content-Encoding"]
return info

View File

@@ -18,3 +18,4 @@ xlrd
six six
cairosvg cairosvg
ffmpeg-python ffmpeg-python
tika

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import DocxParser from parsing import TikaFileParser
import os import os
dir_name = os.path.dirname(os.path.abspath(__file__)) dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,7 +9,7 @@ class DocxParserTest(TestCase):
def test_parse_content(self): def test_parse_content(self):
parser = DocxParser([], 1000, dir_name + "/test_files/") parser = TikaFileParser([], dir_name + "/test_files/", 1000)
info = parser.parse(dir_name + "/test_files/docx1.docx") info = parser.parse(dir_name + "/test_files/docx1.docx")

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import EbookParser from parsing import TikaFileParser
import os import os
dir_name = os.path.dirname(os.path.abspath(__file__)) dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,7 +9,7 @@ class EbookParserTest(TestCase):
def test_parse_content(self): def test_parse_content(self):
parser = EbookParser([], 1000, dir_name + "/test_files/") parser = TikaFileParser([], dir_name + "/test_files/", 1000)
info = parser.parse(dir_name + "/test_files/epub1.epub") info = parser.parse(dir_name + "/test_files/epub1.epub")

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import PdfFileParser from parsing import TikaFileParser
import os import os
dir_name = os.path.dirname(os.path.abspath(__file__)) dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,9 +9,8 @@ class PdfParserTest(TestCase):
def test_parse_content(self): def test_parse_content(self):
parser = PdfFileParser([], 12488, "test_files/") parser = TikaFileParser([], "test_files/", 12488)
info = parser.parse(dir_name + "/test_files/pdf1.pdf") info = parser.parse(dir_name + "/test_files/pdf1.pdf")
self.assertEqual(len(info["content"]), 12488) self.assertEqual(len(info["content"]), 12488)
self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import SpreadSheetParser from parsing import TikaFileParser
import os import os
@@ -10,7 +10,7 @@ class PdfParserTest(TestCase):
def test_parse_content_xls(self): def test_parse_content_xls(self):
parser = SpreadSheetParser([], 1500, "test_files/") parser = TikaFileParser([], "test_files/", 1500)
info = parser.parse(dir_name + "/test_files/xls1.xls") info = parser.parse(dir_name + "/test_files/xls1.xls")
@@ -18,7 +18,7 @@ class PdfParserTest(TestCase):
def test_parse_content_xlsx(self): def test_parse_content_xlsx(self):
parser = SpreadSheetParser([], 1500, "test_files/") parser = TikaFileParser([], "test_files/", 1500)
info = parser.parse(dir_name + "/test_files/xlsx1.xlsx") info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")