mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 14:06:41 +00:00
Replace docx/pdf/spreadsheet parsers with Tika
This commit is contained in:
parent
980babc5cc
commit
25ab9dd9c7
10
common.py
Normal file
10
common.py
Normal file
@ -0,0 +1,10 @@
|
||||
"""Shared Tika setup: configure the Tika working directory, then import Tika."""
import os

# Directory (relative to the current working directory, as before) used for
# Tika's files. An absolute path is used so the value stays valid if the
# process later changes directory.
TIKA_DIR = os.path.abspath("tika")

# exist_ok avoids the check-then-create race of `exists()` + `mkdir()`.
os.makedirs(TIKA_DIR, exist_ok=True)

# Assign through os.environ rather than os.putenv(): putenv() does not update
# os.environ, so os.getenv() lookups made in this process would not see the
# values. The previous code also joined on __name__ (the module name), which
# did not point at the directory created above.
os.environ["TIKA_PATH"] = TIKA_DIR
os.environ["TIKA_LOG_PATH"] = TIKA_DIR

# Imported only after the environment is set up.
# NOTE(review): tika presumably reads TIKA_PATH / TIKA_LOG_PATH at import
# time - confirm against the tika-python documentation.
from tika import parser as tika, config  # noqa: E402

# Return value is intentionally discarded.
# NOTE(review): presumably called to start/warm up the Tika server - confirm.
config.getMimeTypes()
|
@ -3,14 +3,11 @@ default_options = {
|
||||
"ThumbnailQuality": "85",
|
||||
"ThumbnailSize": "272",
|
||||
"ThumbnailColor": "FF00FF",
|
||||
"TextFileContentLength": "2000",
|
||||
"PdfFileContentLength": "2000",
|
||||
"DocxContentLength": "2000",
|
||||
"SpreadSheetContentLength": "2000",
|
||||
"EbookContentLength": "2000",
|
||||
"ContentLength": "4096",
|
||||
"TextFileContentLength": "4096",
|
||||
"MimeGuesser": "extension", # extension, content
|
||||
"CheckSumCalculators": "", # md5, sha1, sha256
|
||||
"FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook"
|
||||
"FileParsers": "media, text, picture, font, tika"
|
||||
}
|
||||
|
||||
# Index documents after every X parsed files (Larger number will use more memory)
|
||||
|
14
crawler.py
14
crawler.py
@ -11,7 +11,7 @@ import config
|
||||
from indexer import Indexer
|
||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
||||
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
|
||||
TikaFileParser
|
||||
from search import Search
|
||||
from storage import Directory
|
||||
from storage import Task, LocalStorage
|
||||
@ -226,16 +226,8 @@ class TaskManager:
|
||||
parsers.append(PictureFileParser(chksum_calcs, directory.path))
|
||||
if "font" in p:
|
||||
parsers.append(FontParser(chksum_calcs, directory.path))
|
||||
if "pdf" in p:
|
||||
parsers.append(
|
||||
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
|
||||
if "docx" in p:
|
||||
parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
|
||||
if "spreadsheet" in p:
|
||||
parsers.append(
|
||||
SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
|
||||
if "ebook" in p:
|
||||
parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
|
||||
if "tika" in p:
|
||||
parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
|
||||
return parsers
|
||||
|
||||
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
|
||||
|
232
parsing.py
232
parsing.py
@ -1,25 +1,14 @@
|
||||
import hashlib
|
||||
import os
|
||||
import mimetypes
|
||||
import subprocess
|
||||
import json
|
||||
import chardet
|
||||
import mimetypes
|
||||
import os
|
||||
import subprocess
|
||||
import warnings
|
||||
import docx2txt
|
||||
import xlrd
|
||||
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
import html2text
|
||||
from ebooklib import epub
|
||||
import ebooklib
|
||||
|
||||
import chardet
|
||||
from PIL import Image
|
||||
from fontTools.ttLib import TTFont, TTLibError
|
||||
import six
|
||||
from six.moves import xrange
|
||||
from common import tika
|
||||
|
||||
|
||||
class MimeGuesser:
|
||||
@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
|
||||
def __init__(self, checksum_calculators: list, root_dir: str):
|
||||
self.checksum_calculators = checksum_calculators
|
||||
self.root_dir = root_dir
|
||||
self.root_dir_len = len(root_dir)+1
|
||||
self.root_dir_len = len(root_dir) + 1
|
||||
|
||||
def parse(self, full_path: str) -> dict:
|
||||
"""
|
||||
@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
|
||||
return info
|
||||
|
||||
|
||||
class PdfFileParser(GenericFileParser):
|
||||
class TikaFileParser(GenericFileParser):
|
||||
mime_types = [
|
||||
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/epub+zip",
|
||||
"application/pdf", "application/x-pdf",
|
||||
]
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
||||
def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
|
||||
super().__init__(checksum_calculators, root_dir)
|
||||
self.content_len = content_len
|
||||
|
||||
self.content_length = content_length
|
||||
|
||||
self.mime_types = [
|
||||
"application/pdf", "application/x-pdf"
|
||||
]
|
||||
|
||||
def parse(self, full_path: str):
|
||||
def parse(self, full_path: str) -> dict:
|
||||
"""
|
||||
Parse a generic file
|
||||
:param full_path: path of the file to parse
|
||||
:return: dict information about the file
|
||||
"""
|
||||
info = super().parse(full_path)
|
||||
|
||||
if self.content_length > 0:
|
||||
with open(full_path, "rb") as f:
|
||||
|
||||
try:
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
except PDFSyntaxError:
|
||||
print("couldn't parse PDF " + full_path)
|
||||
return info
|
||||
|
||||
info["content"] = ""
|
||||
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
||||
if isinstance(document.info[0]["Title"], bytes):
|
||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
||||
else:
|
||||
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
|
||||
|
||||
try:
|
||||
if document.is_extractable:
|
||||
resource_manager = PDFResourceManager()
|
||||
la_params = LAParams()
|
||||
|
||||
device = PDFPageAggregator(resource_manager, laparams=la_params)
|
||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||
|
||||
for page in PDFPage.create_pages(document):
|
||||
|
||||
interpreter.process_page(page)
|
||||
layout = device.get_result()
|
||||
|
||||
for lt_obj in layout:
|
||||
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
|
||||
|
||||
text = lt_obj.get_text()
|
||||
|
||||
if len(info["content"]) + len(text) <= self.content_length:
|
||||
info["content"] += text
|
||||
else:
|
||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
||||
break
|
||||
else:
|
||||
continue
|
||||
break
|
||||
else:
|
||||
print("PDF is not extractable: " + full_path)
|
||||
except ValueError:
|
||||
print("Couldn't parse page for " + full_path)
|
||||
|
||||
return info
|
||||
|
||||
|
||||
class EbookParser(GenericFileParser):
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
||||
super().__init__(checksum_calculators, root_dir)
|
||||
|
||||
self.content_length = content_length
|
||||
|
||||
self.mime_types = [
|
||||
"application/epub+zip"
|
||||
]
|
||||
|
||||
self.html2text = html2text.HTML2Text()
|
||||
self.html2text.ignore_images = True
|
||||
self.html2text.ignore_emphasis = True
|
||||
|
||||
def parse(self, full_path: str):
|
||||
info = super().parse(full_path)
|
||||
|
||||
book = epub.read_epub(full_path)
|
||||
|
||||
info["content"] = ""
|
||||
|
||||
for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
||||
|
||||
text = self.html2text.handle(text.content.decode("utf-8"))
|
||||
|
||||
if len(info["content"]) + len(text) <= self.content_length:
|
||||
info["content"] += text
|
||||
else:
|
||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
||||
break
|
||||
|
||||
return info
|
||||
|
||||
|
||||
class DocxParser(GenericFileParser):
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
||||
super().__init__(checksum_calculators, root_dir)
|
||||
|
||||
self.content_length = content_length
|
||||
|
||||
self.mime_types = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
]
|
||||
|
||||
def parse(self, full_path: str):
|
||||
info = super().parse(full_path)
|
||||
|
||||
if self.content_length > 0:
|
||||
try:
|
||||
text = docx2txt.process(full_path)
|
||||
|
||||
if len(text) < self.content_length:
|
||||
info["content"] = text
|
||||
else:
|
||||
info["content"] = text[0:self.content_length]
|
||||
except:
|
||||
print("Couldn't parse Ebook: " + full_path)
|
||||
|
||||
return info
|
||||
|
||||
|
||||
class SpreadSheetParser(GenericFileParser):
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
||||
super().__init__(checksum_calculators, root_dir)
|
||||
|
||||
self.content_length = content_length
|
||||
|
||||
self.mime_types = [
|
||||
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
]
|
||||
|
||||
def parse(self, full_path: str):
|
||||
info = super().parse(full_path)
|
||||
|
||||
# The MIT License (MIT)
|
||||
# Copyright (c) 2014 Dean Malmgren
|
||||
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
|
||||
|
||||
try:
|
||||
workbook = xlrd.open_workbook(full_path)
|
||||
|
||||
sheets_name = workbook.sheet_names()
|
||||
info["content"] = ""
|
||||
|
||||
for names in sheets_name:
|
||||
worksheet = workbook.sheet_by_name(names)
|
||||
num_rows = worksheet.nrows
|
||||
num_cells = worksheet.ncols
|
||||
|
||||
for curr_row in range(num_rows):
|
||||
new_output = []
|
||||
for index_col in xrange(num_cells):
|
||||
value = worksheet.cell_value(curr_row, index_col)
|
||||
if value:
|
||||
if isinstance(value, (int, float)):
|
||||
value = six.text_type(value)
|
||||
new_output.append(value)
|
||||
|
||||
if new_output:
|
||||
text = u' '.join(new_output) + u'\n'
|
||||
if len(info["content"]) + len(text) <= self.content_length:
|
||||
info["content"] += text
|
||||
else:
|
||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
||||
break
|
||||
|
||||
if info["size"] == 0:
|
||||
return info
|
||||
|
||||
except xlrd.biffh.XLRDError:
|
||||
print("Couldn't parse spreadsheet: " + full_path)
|
||||
tika_res = tika.from_file(full_path)
|
||||
if "metadata" not in tika_res:
|
||||
return info
|
||||
tika_meta = tika_res["metadata"]
|
||||
tika_content = tika_res["content"]
|
||||
|
||||
if isinstance(tika_meta["Content-Type"], list):
|
||||
info["mime"] = tika_meta["Content-Type"][0]
|
||||
else:
|
||||
info["mime"] = tika_meta["Content-Type"]
|
||||
|
||||
if tika_content:
|
||||
info["content"] = tika_content.lstrip()[:self.content_len]
|
||||
|
||||
if "Content-Encoding" in tika_meta:
|
||||
info["encoding"] = tika_meta["Content-Encoding"]
|
||||
|
||||
return info
|
||||
|
@ -17,4 +17,5 @@ docx2txt
|
||||
xlrd
|
||||
six
|
||||
cairosvg
|
||||
ffmpeg-python
|
||||
ffmpeg-python
|
||||
tika
|
@ -1,5 +1,5 @@
|
||||
from unittest import TestCase
|
||||
from parsing import DocxParser
|
||||
from parsing import TikaFileParser
|
||||
import os
|
||||
|
||||
dir_name = os.path.dirname(os.path.abspath(__file__))
|
||||
@ -9,7 +9,7 @@ class DocxParserTest(TestCase):
|
||||
|
||||
def test_parse_content(self):
|
||||
|
||||
parser = DocxParser([], 1000, dir_name + "/test_files/")
|
||||
parser = TikaFileParser([], dir_name + "/test_files/", 1000)
|
||||
|
||||
info = parser.parse(dir_name + "/test_files/docx1.docx")
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
from unittest import TestCase
|
||||
from parsing import EbookParser
|
||||
from parsing import TikaFileParser
|
||||
import os
|
||||
|
||||
dir_name = os.path.dirname(os.path.abspath(__file__))
|
||||
@ -9,7 +9,7 @@ class EbookParserTest(TestCase):
|
||||
|
||||
def test_parse_content(self):
|
||||
|
||||
parser = EbookParser([], 1000, dir_name + "/test_files/")
|
||||
parser = TikaFileParser([], dir_name + "/test_files/", 1000)
|
||||
|
||||
info = parser.parse(dir_name + "/test_files/epub1.epub")
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
from unittest import TestCase
|
||||
from parsing import PdfFileParser
|
||||
from parsing import TikaFileParser
|
||||
import os
|
||||
|
||||
dir_name = os.path.dirname(os.path.abspath(__file__))
|
||||
@ -9,9 +9,8 @@ class PdfParserTest(TestCase):
|
||||
|
||||
def test_parse_content(self):
|
||||
|
||||
parser = PdfFileParser([], 12488, "test_files/")
|
||||
parser = TikaFileParser([], "test_files/", 12488)
|
||||
|
||||
info = parser.parse(dir_name + "/test_files/pdf1.pdf")
|
||||
|
||||
self.assertEqual(len(info["content"]), 12488)
|
||||
self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
|
||||
|
@ -1,5 +1,5 @@
|
||||
from unittest import TestCase
|
||||
from parsing import SpreadSheetParser
|
||||
from parsing import TikaFileParser
|
||||
|
||||
import os
|
||||
|
||||
@ -10,7 +10,7 @@ class PdfParserTest(TestCase):
|
||||
|
||||
def test_parse_content_xls(self):
|
||||
|
||||
parser = SpreadSheetParser([], 1500, "test_files/")
|
||||
parser = TikaFileParser([], "test_files/", 1500)
|
||||
|
||||
info = parser.parse(dir_name + "/test_files/xls1.xls")
|
||||
|
||||
@ -18,7 +18,7 @@ class PdfParserTest(TestCase):
|
||||
|
||||
def test_parse_content_xlsx(self):
|
||||
|
||||
parser = SpreadSheetParser([], 1500, "test_files/")
|
||||
parser = TikaFileParser([], "test_files/", 1500)
|
||||
|
||||
info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user