mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-10-24 04:26:52 +00:00
Replace docx/pdf/spreadsheet parsers with Tika
This commit is contained in:
parent
980babc5cc
commit
25ab9dd9c7
10
common.py
Normal file
10
common.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
"""Bootstrap for the Tika bindings shared by the parsers.

Creates a local ``tika`` working directory, advertises it to the Tika client
through the ``TIKA_PATH`` and ``TIKA_LOG_PATH`` environment variables, and only
then imports the client. Importers of this module get ``tika`` (the parser
module) and ``config`` from here.
"""
import os

# Resolve the directory once so the path that is created and the path that is
# exported are the same one. The previous code created "./tika" but exported
# os.path.join(__name__, "tika/") -- i.e. "common/tika/" -- which is a
# different, never-created location.
_tika_root = os.path.abspath("tika")
_TIKA_DIR = os.path.join(_tika_root, "")  # keep the trailing separator

# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs(_tika_root, exist_ok=True)

# Assign through os.environ instead of calling os.putenv(): putenv() does not
# update os.environ, so values set that way cannot be seen by os.getenv() in
# this process.
os.environ["TIKA_PATH"] = _TIKA_DIR
os.environ["TIKA_LOG_PATH"] = _TIKA_DIR

# Deliberately imported after the environment is prepared; keep this below the
# assignments above.
from tika import parser as tika, config  # noqa: E402

# The return value is intentionally discarded, as before.
# NOTE(review): presumably this call is here to contact/start the Tika server
# eagerly at import time -- confirm against the tika-python documentation.
config.getMimeTypes()
|
@ -3,14 +3,11 @@ default_options = {
|
|||||||
"ThumbnailQuality": "85",
|
"ThumbnailQuality": "85",
|
||||||
"ThumbnailSize": "272",
|
"ThumbnailSize": "272",
|
||||||
"ThumbnailColor": "FF00FF",
|
"ThumbnailColor": "FF00FF",
|
||||||
"TextFileContentLength": "2000",
|
"ContentLength": "4096",
|
||||||
"PdfFileContentLength": "2000",
|
"TextFileContentLength": "4096",
|
||||||
"DocxContentLength": "2000",
|
|
||||||
"SpreadSheetContentLength": "2000",
|
|
||||||
"EbookContentLength": "2000",
|
|
||||||
"MimeGuesser": "extension", # extension, content
|
"MimeGuesser": "extension", # extension, content
|
||||||
"CheckSumCalculators": "", # md5, sha1, sha256
|
"CheckSumCalculators": "", # md5, sha1, sha256
|
||||||
"FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook"
|
"FileParsers": "media, text, picture, font, tika"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Index documents after every X parsed files (Larger number will use more memory)
|
# Index documents after every X parsed files (Larger number will use more memory)
|
||||||
|
14
crawler.py
14
crawler.py
@ -11,7 +11,7 @@ import config
|
|||||||
from indexer import Indexer
|
from indexer import Indexer
|
||||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
||||||
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
|
TikaFileParser
|
||||||
from search import Search
|
from search import Search
|
||||||
from storage import Directory
|
from storage import Directory
|
||||||
from storage import Task, LocalStorage
|
from storage import Task, LocalStorage
|
||||||
@ -226,16 +226,8 @@ class TaskManager:
|
|||||||
parsers.append(PictureFileParser(chksum_calcs, directory.path))
|
parsers.append(PictureFileParser(chksum_calcs, directory.path))
|
||||||
if "font" in p:
|
if "font" in p:
|
||||||
parsers.append(FontParser(chksum_calcs, directory.path))
|
parsers.append(FontParser(chksum_calcs, directory.path))
|
||||||
if "pdf" in p:
|
if "tika" in p:
|
||||||
parsers.append(
|
parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
|
||||||
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
|
|
||||||
if "docx" in p:
|
|
||||||
parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
|
|
||||||
if "spreadsheet" in p:
|
|
||||||
parsers.append(
|
|
||||||
SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
|
|
||||||
if "ebook" in p:
|
|
||||||
parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
|
|
||||||
return parsers
|
return parsers
|
||||||
|
|
||||||
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
|
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
|
||||||
|
232
parsing.py
232
parsing.py
@ -1,25 +1,14 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import os
|
|
||||||
import mimetypes
|
|
||||||
import subprocess
|
|
||||||
import json
|
import json
|
||||||
import chardet
|
import mimetypes
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
import warnings
|
import warnings
|
||||||
import docx2txt
|
|
||||||
import xlrd
|
import chardet
|
||||||
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
|
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
|
||||||
from pdfminer.pdfpage import PDFPage
|
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
||||||
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
|
|
||||||
from pdfminer.converter import PDFPageAggregator
|
|
||||||
import html2text
|
|
||||||
from ebooklib import epub
|
|
||||||
import ebooklib
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from fontTools.ttLib import TTFont, TTLibError
|
from fontTools.ttLib import TTFont, TTLibError
|
||||||
import six
|
from common import tika
|
||||||
from six.moves import xrange
|
|
||||||
|
|
||||||
|
|
||||||
class MimeGuesser:
|
class MimeGuesser:
|
||||||
@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
|
|||||||
def __init__(self, checksum_calculators: list, root_dir: str):
|
def __init__(self, checksum_calculators: list, root_dir: str):
|
||||||
self.checksum_calculators = checksum_calculators
|
self.checksum_calculators = checksum_calculators
|
||||||
self.root_dir = root_dir
|
self.root_dir = root_dir
|
||||||
self.root_dir_len = len(root_dir)+1
|
self.root_dir_len = len(root_dir) + 1
|
||||||
|
|
||||||
def parse(self, full_path: str) -> dict:
|
def parse(self, full_path: str) -> dict:
|
||||||
"""
|
"""
|
||||||
@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
|
|||||||
return info
|
return info
|
||||||
|
|
||||||
|
|
||||||
class PdfFileParser(GenericFileParser):
|
class TikaFileParser(GenericFileParser):
|
||||||
|
mime_types = [
|
||||||
|
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"application/epub+zip",
|
||||||
|
"application/pdf", "application/x-pdf",
|
||||||
|
]
|
||||||
is_default = False
|
is_default = False
|
||||||
|
|
||||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
|
||||||
super().__init__(checksum_calculators, root_dir)
|
super().__init__(checksum_calculators, root_dir)
|
||||||
|
self.content_len = content_len
|
||||||
|
|
||||||
self.content_length = content_length
|
def parse(self, full_path: str) -> dict:
|
||||||
|
"""
|
||||||
self.mime_types = [
|
Parse a generic file
|
||||||
"application/pdf", "application/x-pdf"
|
:param full_path: path of the file to parse
|
||||||
]
|
:return: dict information about the file
|
||||||
|
"""
|
||||||
def parse(self, full_path: str):
|
|
||||||
info = super().parse(full_path)
|
info = super().parse(full_path)
|
||||||
|
|
||||||
if self.content_length > 0:
|
if info["size"] == 0:
|
||||||
with open(full_path, "rb") as f:
|
|
||||||
|
|
||||||
try:
|
|
||||||
parser = PDFParser(f)
|
|
||||||
document = PDFDocument(parser)
|
|
||||||
except PDFSyntaxError:
|
|
||||||
print("couldn't parse PDF " + full_path)
|
|
||||||
return info
|
|
||||||
|
|
||||||
info["content"] = ""
|
|
||||||
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
|
||||||
if isinstance(document.info[0]["Title"], bytes):
|
|
||||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
|
||||||
else:
|
|
||||||
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
|
|
||||||
|
|
||||||
try:
|
|
||||||
if document.is_extractable:
|
|
||||||
resource_manager = PDFResourceManager()
|
|
||||||
la_params = LAParams()
|
|
||||||
|
|
||||||
device = PDFPageAggregator(resource_manager, laparams=la_params)
|
|
||||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
|
||||||
|
|
||||||
for page in PDFPage.create_pages(document):
|
|
||||||
|
|
||||||
interpreter.process_page(page)
|
|
||||||
layout = device.get_result()
|
|
||||||
|
|
||||||
for lt_obj in layout:
|
|
||||||
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
|
|
||||||
|
|
||||||
text = lt_obj.get_text()
|
|
||||||
|
|
||||||
if len(info["content"]) + len(text) <= self.content_length:
|
|
||||||
info["content"] += text
|
|
||||||
else:
|
|
||||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
print("PDF is not extractable: " + full_path)
|
|
||||||
except ValueError:
|
|
||||||
print("Couldn't parse page for " + full_path)
|
|
||||||
|
|
||||||
return info
|
|
||||||
|
|
||||||
|
|
||||||
class EbookParser(GenericFileParser):
|
|
||||||
is_default = False
|
|
||||||
|
|
||||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
|
||||||
super().__init__(checksum_calculators, root_dir)
|
|
||||||
|
|
||||||
self.content_length = content_length
|
|
||||||
|
|
||||||
self.mime_types = [
|
|
||||||
"application/epub+zip"
|
|
||||||
]
|
|
||||||
|
|
||||||
self.html2text = html2text.HTML2Text()
|
|
||||||
self.html2text.ignore_images = True
|
|
||||||
self.html2text.ignore_emphasis = True
|
|
||||||
|
|
||||||
def parse(self, full_path: str):
|
|
||||||
info = super().parse(full_path)
|
|
||||||
|
|
||||||
book = epub.read_epub(full_path)
|
|
||||||
|
|
||||||
info["content"] = ""
|
|
||||||
|
|
||||||
for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
|
|
||||||
|
|
||||||
text = self.html2text.handle(text.content.decode("utf-8"))
|
|
||||||
|
|
||||||
if len(info["content"]) + len(text) <= self.content_length:
|
|
||||||
info["content"] += text
|
|
||||||
else:
|
|
||||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
|
||||||
break
|
|
||||||
|
|
||||||
return info
|
|
||||||
|
|
||||||
|
|
||||||
class DocxParser(GenericFileParser):
|
|
||||||
is_default = False
|
|
||||||
|
|
||||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
|
||||||
super().__init__(checksum_calculators, root_dir)
|
|
||||||
|
|
||||||
self.content_length = content_length
|
|
||||||
|
|
||||||
self.mime_types = [
|
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
||||||
]
|
|
||||||
|
|
||||||
def parse(self, full_path: str):
|
|
||||||
info = super().parse(full_path)
|
|
||||||
|
|
||||||
if self.content_length > 0:
|
|
||||||
try:
|
|
||||||
text = docx2txt.process(full_path)
|
|
||||||
|
|
||||||
if len(text) < self.content_length:
|
|
||||||
info["content"] = text
|
|
||||||
else:
|
|
||||||
info["content"] = text[0:self.content_length]
|
|
||||||
except:
|
|
||||||
print("Couldn't parse Ebook: " + full_path)
|
|
||||||
|
|
||||||
return info
|
|
||||||
|
|
||||||
|
|
||||||
class SpreadSheetParser(GenericFileParser):
|
|
||||||
is_default = False
|
|
||||||
|
|
||||||
def __init__(self, checksum_calculators: list, content_length: int, root_dir):
|
|
||||||
super().__init__(checksum_calculators, root_dir)
|
|
||||||
|
|
||||||
self.content_length = content_length
|
|
||||||
|
|
||||||
self.mime_types = [
|
|
||||||
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
||||||
]
|
|
||||||
|
|
||||||
def parse(self, full_path: str):
|
|
||||||
info = super().parse(full_path)
|
|
||||||
|
|
||||||
# The MIT License (MIT)
|
|
||||||
# Copyright (c) 2014 Dean Malmgren
|
|
||||||
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
|
|
||||||
|
|
||||||
try:
|
|
||||||
workbook = xlrd.open_workbook(full_path)
|
|
||||||
|
|
||||||
sheets_name = workbook.sheet_names()
|
|
||||||
info["content"] = ""
|
|
||||||
|
|
||||||
for names in sheets_name:
|
|
||||||
worksheet = workbook.sheet_by_name(names)
|
|
||||||
num_rows = worksheet.nrows
|
|
||||||
num_cells = worksheet.ncols
|
|
||||||
|
|
||||||
for curr_row in range(num_rows):
|
|
||||||
new_output = []
|
|
||||||
for index_col in xrange(num_cells):
|
|
||||||
value = worksheet.cell_value(curr_row, index_col)
|
|
||||||
if value:
|
|
||||||
if isinstance(value, (int, float)):
|
|
||||||
value = six.text_type(value)
|
|
||||||
new_output.append(value)
|
|
||||||
|
|
||||||
if new_output:
|
|
||||||
text = u' '.join(new_output) + u'\n'
|
|
||||||
if len(info["content"]) + len(text) <= self.content_length:
|
|
||||||
info["content"] += text
|
|
||||||
else:
|
|
||||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
|
||||||
break
|
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
except xlrd.biffh.XLRDError:
|
tika_res = tika.from_file(full_path)
|
||||||
print("Couldn't parse spreadsheet: " + full_path)
|
if "metadata" not in tika_res:
|
||||||
|
return info
|
||||||
|
tika_meta = tika_res["metadata"]
|
||||||
|
tika_content = tika_res["content"]
|
||||||
|
|
||||||
|
if isinstance(tika_meta["Content-Type"], list):
|
||||||
|
info["mime"] = tika_meta["Content-Type"][0]
|
||||||
|
else:
|
||||||
|
info["mime"] = tika_meta["Content-Type"]
|
||||||
|
|
||||||
|
if tika_content:
|
||||||
|
info["content"] = tika_content.lstrip()[:self.content_len]
|
||||||
|
|
||||||
|
if "Content-Encoding" in tika_meta:
|
||||||
|
info["encoding"] = tika_meta["Content-Encoding"]
|
||||||
|
|
||||||
|
return info
|
||||||
|
@ -18,3 +18,4 @@ xlrd
|
|||||||
six
|
six
|
||||||
cairosvg
|
cairosvg
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
|
tika
|
@ -1,5 +1,5 @@
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from parsing import DocxParser
|
from parsing import TikaFileParser
|
||||||
import os
|
import os
|
||||||
|
|
||||||
dir_name = os.path.dirname(os.path.abspath(__file__))
|
dir_name = os.path.dirname(os.path.abspath(__file__))
|
||||||
@ -9,7 +9,7 @@ class DocxParserTest(TestCase):
|
|||||||
|
|
||||||
def test_parse_content(self):
|
def test_parse_content(self):
|
||||||
|
|
||||||
parser = DocxParser([], 1000, dir_name + "/test_files/")
|
parser = TikaFileParser([], dir_name + "/test_files/", 1000)
|
||||||
|
|
||||||
info = parser.parse(dir_name + "/test_files/docx1.docx")
|
info = parser.parse(dir_name + "/test_files/docx1.docx")
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from parsing import EbookParser
|
from parsing import TikaFileParser
|
||||||
import os
|
import os
|
||||||
|
|
||||||
dir_name = os.path.dirname(os.path.abspath(__file__))
|
dir_name = os.path.dirname(os.path.abspath(__file__))
|
||||||
@ -9,7 +9,7 @@ class EbookParserTest(TestCase):
|
|||||||
|
|
||||||
def test_parse_content(self):
|
def test_parse_content(self):
|
||||||
|
|
||||||
parser = EbookParser([], 1000, dir_name + "/test_files/")
|
parser = TikaFileParser([], dir_name + "/test_files/", 1000)
|
||||||
|
|
||||||
info = parser.parse(dir_name + "/test_files/epub1.epub")
|
info = parser.parse(dir_name + "/test_files/epub1.epub")
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from parsing import PdfFileParser
|
from parsing import TikaFileParser
|
||||||
import os
|
import os
|
||||||
|
|
||||||
dir_name = os.path.dirname(os.path.abspath(__file__))
|
dir_name = os.path.dirname(os.path.abspath(__file__))
|
||||||
@ -9,9 +9,8 @@ class PdfParserTest(TestCase):
|
|||||||
|
|
||||||
def test_parse_content(self):
|
def test_parse_content(self):
|
||||||
|
|
||||||
parser = PdfFileParser([], 12488, "test_files/")
|
parser = TikaFileParser([], "test_files/", 12488)
|
||||||
|
|
||||||
info = parser.parse(dir_name + "/test_files/pdf1.pdf")
|
info = parser.parse(dir_name + "/test_files/pdf1.pdf")
|
||||||
|
|
||||||
self.assertEqual(len(info["content"]), 12488)
|
self.assertEqual(len(info["content"]), 12488)
|
||||||
self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from parsing import SpreadSheetParser
|
from parsing import TikaFileParser
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@ -10,7 +10,7 @@ class PdfParserTest(TestCase):
|
|||||||
|
|
||||||
def test_parse_content_xls(self):
|
def test_parse_content_xls(self):
|
||||||
|
|
||||||
parser = SpreadSheetParser([], 1500, "test_files/")
|
parser = TikaFileParser([], "test_files/", 1500)
|
||||||
|
|
||||||
info = parser.parse(dir_name + "/test_files/xls1.xls")
|
info = parser.parse(dir_name + "/test_files/xls1.xls")
|
||||||
|
|
||||||
@ -18,7 +18,7 @@ class PdfParserTest(TestCase):
|
|||||||
|
|
||||||
def test_parse_content_xlsx(self):
|
def test_parse_content_xlsx(self):
|
||||||
|
|
||||||
parser = SpreadSheetParser([], 1500, "test_files/")
|
parser = TikaFileParser([], "test_files/", 1500)
|
||||||
|
|
||||||
info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")
|
info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user