Replace docx/pdf/spreadsheet parsers with Tika

This commit is contained in:
simon 2019-06-13 15:30:50 -04:00
parent 980babc5cc
commit 25ab9dd9c7
9 changed files with 67 additions and 220 deletions

10
common.py Normal file
View File

@@ -0,0 +1,10 @@
import os

# Keep Apache Tika's runtime files (the server jar and its logs) in a local
# "tika/" directory next to this module instead of the system temp dir.
_TIKA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tika")
if not os.path.exists(_TIKA_DIR):
    os.mkdir(_TIKA_DIR)

# BUG FIX: os.putenv() only affects child processes and does NOT update
# os.environ, which is what the tika-python client reads its configuration
# from — assign through os.environ so the settings actually take effect.
# Also, the original joined against __name__ (a module-name string, not a
# path) while mkdir() created "tika" relative to the CWD; both now agree
# on the same absolute directory.
os.environ["TIKA_PATH"] = _TIKA_DIR
os.environ["TIKA_LOG_PATH"] = _TIKA_DIR

from tika import parser as tika, config

# Touch the server at import time so the first real parse call does not
# pay the Tika server startup cost.
config.getMimeTypes()

View File

@@ -3,14 +3,11 @@ default_options = {
"ThumbnailQuality": "85", "ThumbnailQuality": "85",
"ThumbnailSize": "272", "ThumbnailSize": "272",
"ThumbnailColor": "FF00FF", "ThumbnailColor": "FF00FF",
"TextFileContentLength": "2000", "ContentLength": "4096",
"PdfFileContentLength": "2000", "TextFileContentLength": "4096",
"DocxContentLength": "2000",
"SpreadSheetContentLength": "2000",
"EbookContentLength": "2000",
"MimeGuesser": "extension", # extension, content "MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256 "CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook" "FileParsers": "media, text, picture, font, tika"
} }
# Index documents after every X parsed files (Larger number will use more memory) # Index documents after every X parsed files (Larger number will use more memory)

View File

@@ -11,7 +11,7 @@ import config
from indexer import Indexer from indexer import Indexer
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser TikaFileParser
from search import Search from search import Search
from storage import Directory from storage import Directory
from storage import Task, LocalStorage from storage import Task, LocalStorage
@@ -226,16 +226,8 @@ class TaskManager:
parsers.append(PictureFileParser(chksum_calcs, directory.path)) parsers.append(PictureFileParser(chksum_calcs, directory.path))
if "font" in p: if "font" in p:
parsers.append(FontParser(chksum_calcs, directory.path)) parsers.append(FontParser(chksum_calcs, directory.path))
if "pdf" in p: if "tika" in p:
parsers.append( parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
if "docx" in p:
parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
if "spreadsheet" in p:
parsers.append(
SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
if "ebook" in p:
parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
return parsers return parsers
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value): def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):

View File

@@ -1,25 +1,14 @@
import hashlib import hashlib
import os
import mimetypes
import subprocess
import json import json
import chardet import mimetypes
import os
import subprocess
import warnings import warnings
import docx2txt
import xlrd import chardet
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import html2text
from ebooklib import epub
import ebooklib
from PIL import Image from PIL import Image
from fontTools.ttLib import TTFont, TTLibError from fontTools.ttLib import TTFont, TTLibError
import six from common import tika
from six.moves import xrange
class MimeGuesser: class MimeGuesser:
@@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
def __init__(self, checksum_calculators: list, root_dir: str): def __init__(self, checksum_calculators: list, root_dir: str):
self.checksum_calculators = checksum_calculators self.checksum_calculators = checksum_calculators
self.root_dir = root_dir self.root_dir = root_dir
self.root_dir_len = len(root_dir)+1 self.root_dir_len = len(root_dir) + 1
def parse(self, full_path: str) -> dict: def parse(self, full_path: str) -> dict:
""" """
@@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
return info return info
class PdfFileParser(GenericFileParser): class TikaFileParser(GenericFileParser):
mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/epub+zip",
"application/pdf", "application/x-pdf",
]
is_default = False is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir): def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
super().__init__(checksum_calculators, root_dir) super().__init__(checksum_calculators, root_dir)
self.content_len = content_len
self.content_length = content_length def parse(self, full_path: str) -> dict:
"""
self.mime_types = [ Parse a generic file
"application/pdf", "application/x-pdf" :param full_path: path of the file to parse
] :return: dict information about the file
"""
def parse(self, full_path: str):
info = super().parse(full_path) info = super().parse(full_path)
if self.content_length > 0: if info["size"] == 0:
with open(full_path, "rb") as f:
try:
parser = PDFParser(f)
document = PDFDocument(parser)
except PDFSyntaxError:
print("couldn't parse PDF " + full_path)
return info
info["content"] = ""
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
if isinstance(document.info[0]["Title"], bytes):
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
else:
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
try:
if document.is_extractable:
resource_manager = PDFResourceManager()
la_params = LAParams()
device = PDFPageAggregator(resource_manager, laparams=la_params)
interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
text = lt_obj.get_text()
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
else:
continue
break
else:
print("PDF is not extractable: " + full_path)
except ValueError:
print("Couldn't parse page for " + full_path)
return info
class EbookParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/epub+zip"
]
self.html2text = html2text.HTML2Text()
self.html2text.ignore_images = True
self.html2text.ignore_emphasis = True
def parse(self, full_path: str):
info = super().parse(full_path)
book = epub.read_epub(full_path)
info["content"] = ""
for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
text = self.html2text.handle(text.content.decode("utf-8"))
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
return info
class DocxParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]
def parse(self, full_path: str):
info = super().parse(full_path)
if self.content_length > 0:
try:
text = docx2txt.process(full_path)
if len(text) < self.content_length:
info["content"] = text
else:
info["content"] = text[0:self.content_length]
except:
print("Couldn't parse Ebook: " + full_path)
return info
class SpreadSheetParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
super().__init__(checksum_calculators, root_dir)
self.content_length = content_length
self.mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
def parse(self, full_path: str):
info = super().parse(full_path)
# The MIT License (MIT)
# Copyright (c) 2014 Dean Malmgren
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
try:
workbook = xlrd.open_workbook(full_path)
sheets_name = workbook.sheet_names()
info["content"] = ""
for names in sheets_name:
worksheet = workbook.sheet_by_name(names)
num_rows = worksheet.nrows
num_cells = worksheet.ncols
for curr_row in range(num_rows):
new_output = []
for index_col in xrange(num_cells):
value = worksheet.cell_value(curr_row, index_col)
if value:
if isinstance(value, (int, float)):
value = six.text_type(value)
new_output.append(value)
if new_output:
text = u' '.join(new_output) + u'\n'
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
return info return info
except xlrd.biffh.XLRDError: tika_res = tika.from_file(full_path)
print("Couldn't parse spreadsheet: " + full_path) if "metadata" not in tika_res:
return info
tika_meta = tika_res["metadata"]
tika_content = tika_res["content"]
if isinstance(tika_meta["Content-Type"], list):
info["mime"] = tika_meta["Content-Type"][0]
else:
info["mime"] = tika_meta["Content-Type"]
if tika_content:
info["content"] = tika_content.lstrip()[:self.content_len]
if "Content-Encoding" in tika_meta:
info["encoding"] = tika_meta["Content-Encoding"]
return info

View File

@@ -18,3 +18,4 @@ xlrd
six six
cairosvg cairosvg
ffmpeg-python ffmpeg-python
tika

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import DocxParser from parsing import TikaFileParser
import os import os
dir_name = os.path.dirname(os.path.abspath(__file__)) dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,7 +9,7 @@ class DocxParserTest(TestCase):
def test_parse_content(self): def test_parse_content(self):
parser = DocxParser([], 1000, dir_name + "/test_files/") parser = TikaFileParser([], dir_name + "/test_files/", 1000)
info = parser.parse(dir_name + "/test_files/docx1.docx") info = parser.parse(dir_name + "/test_files/docx1.docx")

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import EbookParser from parsing import TikaFileParser
import os import os
dir_name = os.path.dirname(os.path.abspath(__file__)) dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,7 +9,7 @@ class EbookParserTest(TestCase):
def test_parse_content(self): def test_parse_content(self):
parser = EbookParser([], 1000, dir_name + "/test_files/") parser = TikaFileParser([], dir_name + "/test_files/", 1000)
info = parser.parse(dir_name + "/test_files/epub1.epub") info = parser.parse(dir_name + "/test_files/epub1.epub")

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import PdfFileParser from parsing import TikaFileParser
import os import os
dir_name = os.path.dirname(os.path.abspath(__file__)) dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,9 +9,8 @@ class PdfParserTest(TestCase):
def test_parse_content(self): def test_parse_content(self):
parser = PdfFileParser([], 12488, "test_files/") parser = TikaFileParser([], "test_files/", 12488)
info = parser.parse(dir_name + "/test_files/pdf1.pdf") info = parser.parse(dir_name + "/test_files/pdf1.pdf")
self.assertEqual(len(info["content"]), 12488) self.assertEqual(len(info["content"]), 12488)
self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))

View File

@@ -1,5 +1,5 @@
from unittest import TestCase from unittest import TestCase
from parsing import SpreadSheetParser from parsing import TikaFileParser
import os import os
@@ -10,7 +10,7 @@ class PdfParserTest(TestCase):
def test_parse_content_xls(self): def test_parse_content_xls(self):
parser = SpreadSheetParser([], 1500, "test_files/") parser = TikaFileParser([], "test_files/", 1500)
info = parser.parse(dir_name + "/test_files/xls1.xls") info = parser.parse(dir_name + "/test_files/xls1.xls")
@@ -18,7 +18,7 @@ class PdfParserTest(TestCase):
def test_parse_content_xlsx(self): def test_parse_content_xlsx(self):
parser = SpreadSheetParser([], 1500, "test_files/") parser = TikaFileParser([], "test_files/", 1500)
info = parser.parse(dir_name + "/test_files/xlsx1.xlsx") info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")