Replace docx/pdf/spreadsheet parsers with Tika

This commit is contained in:
simon 2019-06-13 15:30:50 -04:00
parent 980babc5cc
commit 25ab9dd9c7
9 changed files with 67 additions and 220 deletions

10
common.py Normal file
View File

@ -0,0 +1,10 @@
import os
if not os.path.exists("tika"):
os.mkdir("tika")
os.putenv("TIKA_PATH", os.path.join(__name__, "tika/"))
os.putenv("TIKA_LOG_PATH", os.path.join(__name__, "tika/"))
from tika import parser as tika, config
config.getMimeTypes()

View File

@ -3,14 +3,11 @@ default_options = {
"ThumbnailQuality": "85",
"ThumbnailSize": "272",
"ThumbnailColor": "FF00FF",
"TextFileContentLength": "2000",
"PdfFileContentLength": "2000",
"DocxContentLength": "2000",
"SpreadSheetContentLength": "2000",
"EbookContentLength": "2000",
"ContentLength": "4096",
"TextFileContentLength": "4096",
"MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook"
"FileParsers": "media, text, picture, font, tika"
}
# Index documents after every X parsed files (Larger number will use more memory)

View File

@ -11,7 +11,7 @@ import config
from indexer import Indexer
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
TikaFileParser
from search import Search
from storage import Directory
from storage import Task, LocalStorage
@ -226,16 +226,8 @@ class TaskManager:
parsers.append(PictureFileParser(chksum_calcs, directory.path))
if "font" in p:
parsers.append(FontParser(chksum_calcs, directory.path))
if "pdf" in p:
parsers.append(
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
if "docx" in p:
parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
if "spreadsheet" in p:
parsers.append(
SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
if "ebook" in p:
parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
if "tika" in p:
parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
return parsers
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):

View File

@ -1,25 +1,14 @@
import hashlib
import os
import mimetypes
import subprocess
import json
import chardet
import mimetypes
import os
import subprocess
import warnings
import docx2txt
import xlrd
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import html2text
from ebooklib import epub
import ebooklib
import chardet
from PIL import Image
from fontTools.ttLib import TTFont, TTLibError
import six
from six.moves import xrange
from common import tika
class MimeGuesser:
@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
def __init__(self, checksum_calculators: list, root_dir: str):
self.checksum_calculators = checksum_calculators
self.root_dir = root_dir
self.root_dir_len = len(root_dir)+1
self.root_dir_len = len(root_dir) + 1
def parse(self, full_path: str) -> dict:
"""
@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
return info
class PdfFileParser(GenericFileParser):
class TikaFileParser(GenericFileParser):
mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/epub+zip",
"application/pdf", "application/x-pdf",
]
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
super().__init__(checksum_calculators, root_dir)
self.content_len = content_len
self.content_length = content_length
self.mime_types = [
"application/pdf", "application/x-pdf"
]
def parse(self, full_path: str):
def parse(self, full_path: str) -> dict:
"""
Parse a generic file
:param full_path: path of the file to parse
:return: dict information about the file
"""
info = super().parse(full_path)
if self.content_length > 0:
with open(full_path, "rb") as f:
try:
parser = PDFParser(f)
document = PDFDocument(parser)
except PDFSyntaxError:
print("couldn't parse PDF " + full_path)
return info
info["content"] = ""
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
if isinstance(document.info[0]["Title"], bytes):
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
else:
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
try:
if document.is_extractable:
resource_manager = PDFResourceManager()
la_params = LAParams()
device = PDFPageAggregator(resource_manager, laparams=la_params)
interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
text = lt_obj.get_text()
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
else:
continue
break
else:
print("PDF is not extractable: " + full_path)
except ValueError:
print("Couldn't parse page for " + full_path)
return info
class EbookParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/epub+zip"
]
self.html2text = html2text.HTML2Text()
self.html2text.ignore_images = True
self.html2text.ignore_emphasis = True
def parse(self, full_path: str):
info = super().parse(full_path)
book = epub.read_epub(full_path)
info["content"] = ""
for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
text = self.html2text.handle(text.content.decode("utf-8"))
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
return info
class DocxParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]
def parse(self, full_path: str):
info = super().parse(full_path)
if self.content_length > 0:
try:
text = docx2txt.process(full_path)
if len(text) < self.content_length:
info["content"] = text
else:
info["content"] = text[0:self.content_length]
except:
print("Couldn't parse Ebook: " + full_path)
return info
class SpreadSheetParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
def parse(self, full_path: str):
info = super().parse(full_path)
# The MIT License (MIT)
# Copyright (c) 2014 Dean Malmgren
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
try:
workbook = xlrd.open_workbook(full_path)
sheets_name = workbook.sheet_names()
info["content"] = ""
for names in sheets_name:
worksheet = workbook.sheet_by_name(names)
num_rows = worksheet.nrows
num_cells = worksheet.ncols
for curr_row in range(num_rows):
new_output = []
for index_col in xrange(num_cells):
value = worksheet.cell_value(curr_row, index_col)
if value:
if isinstance(value, (int, float)):
value = six.text_type(value)
new_output.append(value)
if new_output:
text = u' '.join(new_output) + u'\n'
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
if info["size"] == 0:
return info
except xlrd.biffh.XLRDError:
print("Couldn't parse spreadsheet: " + full_path)
tika_res = tika.from_file(full_path)
if "metadata" not in tika_res:
return info
tika_meta = tika_res["metadata"]
tika_content = tika_res["content"]
if isinstance(tika_meta["Content-Type"], list):
info["mime"] = tika_meta["Content-Type"][0]
else:
info["mime"] = tika_meta["Content-Type"]
if tika_content:
info["content"] = tika_content.lstrip()[:self.content_len]
if "Content-Encoding" in tika_meta:
info["encoding"] = tika_meta["Content-Encoding"]
return info

View File

@ -17,4 +17,5 @@ docx2txt
xlrd
six
cairosvg
ffmpeg-python
ffmpeg-python
tika

View File

@ -1,5 +1,5 @@
from unittest import TestCase
from parsing import DocxParser
from parsing import TikaFileParser
import os
dir_name = os.path.dirname(os.path.abspath(__file__))
@ -9,7 +9,7 @@ class DocxParserTest(TestCase):
def test_parse_content(self):
parser = DocxParser([], 1000, dir_name + "/test_files/")
parser = TikaFileParser([], dir_name + "/test_files/", 1000)
info = parser.parse(dir_name + "/test_files/docx1.docx")

View File

@ -1,5 +1,5 @@
from unittest import TestCase
from parsing import EbookParser
from parsing import TikaFileParser
import os
dir_name = os.path.dirname(os.path.abspath(__file__))
@ -9,7 +9,7 @@ class EbookParserTest(TestCase):
def test_parse_content(self):
parser = EbookParser([], 1000, dir_name + "/test_files/")
parser = TikaFileParser([], dir_name + "/test_files/", 1000)
info = parser.parse(dir_name + "/test_files/epub1.epub")

View File

@ -1,5 +1,5 @@
from unittest import TestCase
from parsing import PdfFileParser
from parsing import TikaFileParser
import os
dir_name = os.path.dirname(os.path.abspath(__file__))
@ -9,9 +9,8 @@ class PdfParserTest(TestCase):
def test_parse_content(self):
parser = PdfFileParser([], 12488, "test_files/")
parser = TikaFileParser([], "test_files/", 12488)
info = parser.parse(dir_name + "/test_files/pdf1.pdf")
self.assertEqual(len(info["content"]), 12488)
self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))

View File

@ -1,5 +1,5 @@
from unittest import TestCase
from parsing import SpreadSheetParser
from parsing import TikaFileParser
import os
@ -10,7 +10,7 @@ class PdfParserTest(TestCase):
def test_parse_content_xls(self):
parser = SpreadSheetParser([], 1500, "test_files/")
parser = TikaFileParser([], "test_files/", 1500)
info = parser.parse(dir_name + "/test_files/xls1.xls")
@ -18,7 +18,7 @@ class PdfParserTest(TestCase):
def test_parse_content_xlsx(self):
parser = SpreadSheetParser([], 1500, "test_files/")
parser = TikaFileParser([], "test_files/", 1500)
info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")