diff --git a/common.py b/common.py
new file mode 100644
index 0000000..1dc0f6b
--- /dev/null
+++ b/common.py
@@ -0,0 +1,10 @@
+import os
+
+if not os.path.exists("tika"):
+    os.mkdir("tika")
+os.putenv("TIKA_PATH", os.path.join(__name__, "tika/"))
+os.putenv("TIKA_LOG_PATH", os.path.join(__name__, "tika/"))
+
+from tika import parser as tika, config
+
+config.getMimeTypes()
diff --git a/config.py b/config.py
index 2c1d543..170dcc2 100644
--- a/config.py
+++ b/config.py
@@ -3,14 +3,11 @@ default_options = {
     "ThumbnailQuality": "85",
     "ThumbnailSize": "272",
     "ThumbnailColor": "FF00FF",
-    "TextFileContentLength": "2000",
-    "PdfFileContentLength": "2000",
-    "DocxContentLength": "2000",
-    "SpreadSheetContentLength": "2000",
-    "EbookContentLength": "2000",
+    "ContentLength": "4096",
+    "TextFileContentLength": "4096",
     "MimeGuesser": "extension",  # extension, content
     "CheckSumCalculators": "",  # md5, sha1, sha256
-    "FileParsers": "media, text, picture, font, pdf, docx, spreadsheet, ebook"
+    "FileParsers": "media, text, picture, font, tika"
 }
 
 # Index documents after every X parsed files (Larger number will use more memory)
diff --git a/crawler.py b/crawler.py
index 6b64b8e..b5a3970 100644
--- a/crawler.py
+++ b/crawler.py
@@ -11,7 +11,7 @@ import config
 from indexer import Indexer
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
     PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
-    PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
+    TikaFileParser
 from search import Search
 from storage import Directory
 from storage import Task, LocalStorage
@@ -226,16 +226,8 @@ class TaskManager:
             parsers.append(PictureFileParser(chksum_calcs, directory.path))
         if "font" in p:
             parsers.append(FontParser(chksum_calcs, directory.path))
-        if "pdf" in p:
-            parsers.append(
-                PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
-        if "docx" in p:
-            parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
-        if "spreadsheet" in p:
-            parsers.append(
-                SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
-        if "ebook" in p:
-            parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
+        if "tika" in p:
+            parsers.append(TikaFileParser(chksum_calcs, directory.path, int(directory.get_option("ContentLength"))))
         return parsers
 
     def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
diff --git a/parsing.py b/parsing.py
index a411b37..6a4e3eb 100644
--- a/parsing.py
+++ b/parsing.py
@@ -1,25 +1,14 @@
 import hashlib
-import os
-import mimetypes
-import subprocess
 import json
-import chardet
+import mimetypes
+import os
+import subprocess
 import warnings
-import docx2txt
-import xlrd
-from pdfminer.pdfparser import PDFParser, PDFSyntaxError
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.layout import LAParams, LTTextBox, LTTextLine
-from pdfminer.converter import PDFPageAggregator
-import html2text
-from ebooklib import epub
-import ebooklib
+
+import chardet
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
-import six
-from six.moves import xrange
+from common import tika
 
 
 class MimeGuesser:
@@ -127,7 +116,7 @@ class GenericFileParser(FileParser):
     def __init__(self, checksum_calculators: list, root_dir: str):
         self.checksum_calculators = checksum_calculators
         self.root_dir = root_dir
-        self.root_dir_len = len(root_dir)+1
+        self.root_dir_len = len(root_dir) + 1
 
     def parse(self, full_path: str) -> dict:
         """
@@ -335,186 +324,45 @@ class FontParser(GenericFileParser):
         return info
 
 
-class PdfFileParser(GenericFileParser):
+class TikaFileParser(GenericFileParser):
+    mime_types = [
+        "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/epub+zip",
+        "application/pdf", "application/x-pdf",
+    ]
     is_default = False
 
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+    def __init__(self, checksum_calculators: list, root_dir: str, content_len=4096):
         super().__init__(checksum_calculators, root_dir)
+        self.content_len = content_len
 
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/pdf", "application/x-pdf"
-        ]
-
-    def parse(self, full_path: str):
+    def parse(self, full_path: str) -> dict:
+        """
+        Parse a generic file
+        :param full_path: path of the file to parse
+        :return: dict information about the file
+        """
         info = super().parse(full_path)
 
-        if self.content_length > 0:
-            with open(full_path, "rb") as f:
-
-                try:
-                    parser = PDFParser(f)
-                    document = PDFDocument(parser)
-                except PDFSyntaxError:
-                    print("couldn't parse PDF " + full_path)
-                    return info
-
-                info["content"] = ""
-                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
-                    if isinstance(document.info[0]["Title"], bytes):
-                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
-                    else:
-                        info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
-
-                try:
-                    if document.is_extractable:
-                        resource_manager = PDFResourceManager()
-                        la_params = LAParams()
-
-                        device = PDFPageAggregator(resource_manager, laparams=la_params)
-                        interpreter = PDFPageInterpreter(resource_manager, device)
-
-                        for page in PDFPage.create_pages(document):
-
-                            interpreter.process_page(page)
-                            layout = device.get_result()
-
-                            for lt_obj in layout:
-                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
-
-                                    text = lt_obj.get_text()
-
-                                    if len(info["content"]) + len(text) <= self.content_length:
-                                        info["content"] += text
-                                    else:
-                                        info["content"] += text[0:self.content_length - len(info["content"])]
-                                        break
-                            else:
-                                continue
-                            break
-                    else:
-                        print("PDF is not extractable: " + full_path)
-                except ValueError:
-                    print("Couldn't parse page for " + full_path)
-
-        return info
-
-
-class EbookParser(GenericFileParser):
-    is_default = False
-
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
-        super().__init__(checksum_calculators, root_dir)
-
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/epub+zip"
-        ]
-
-        self.html2text = html2text.HTML2Text()
-        self.html2text.ignore_images = True
-        self.html2text.ignore_emphasis = True
-
-    def parse(self, full_path: str):
-        info = super().parse(full_path)
-
-        book = epub.read_epub(full_path)
-
-        info["content"] = ""
-
-        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
-
-            text = self.html2text.handle(text.content.decode("utf-8"))
-
-            if len(info["content"]) + len(text) <= self.content_length:
-                info["content"] += text
-            else:
-                info["content"] += text[0:self.content_length - len(info["content"])]
-                break
-
-        return info
-
-
-class DocxParser(GenericFileParser):
-    is_default = False
-
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
-        super().__init__(checksum_calculators, root_dir)
-
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ]
-
-    def parse(self, full_path: str):
-        info = super().parse(full_path)
-
-        if self.content_length > 0:
-            try:
-                text = docx2txt.process(full_path)
-
-                if len(text) < self.content_length:
-                    info["content"] = text
-                else:
-                    info["content"] = text[0:self.content_length]
-            except:
-                print("Couldn't parse Ebook: " + full_path)
-
-        return info
-
-
-class SpreadSheetParser(GenericFileParser):
-    is_default = False
-
-    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
-        super().__init__(checksum_calculators, root_dir)
-
-        self.content_length = content_length
-
-        self.mime_types = [
-            "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        ]
-
-    def parse(self, full_path: str):
-        info = super().parse(full_path)
-
-        # The MIT License (MIT)
-        # Copyright (c) 2014 Dean Malmgren
-        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
-
-        try:
-            workbook = xlrd.open_workbook(full_path)
-
-            sheets_name = workbook.sheet_names()
-            info["content"] = ""
-
-            for names in sheets_name:
-                worksheet = workbook.sheet_by_name(names)
-                num_rows = worksheet.nrows
-                num_cells = worksheet.ncols
-
-                for curr_row in range(num_rows):
-                    new_output = []
-                    for index_col in xrange(num_cells):
-                        value = worksheet.cell_value(curr_row, index_col)
-                        if value:
-                            if isinstance(value, (int, float)):
-                                value = six.text_type(value)
-                            new_output.append(value)
-
-                    if new_output:
-                        text = u' '.join(new_output) + u'\n'
-                        if len(info["content"]) + len(text) <= self.content_length:
-                            info["content"] += text
-                        else:
-                            info["content"] += text[0:self.content_length - len(info["content"])]
-                            break
-
+        if info["size"] == 0:
             return info
-        except xlrd.biffh.XLRDError:
-            print("Couldn't parse spreadsheet: " + full_path)
 
+        tika_res = tika.from_file(full_path)
+        if "metadata" not in tika_res:
+            return info
+        tika_meta = tika_res["metadata"]
+        tika_content = tika_res["content"]
+        if isinstance(tika_meta["Content-Type"], list):
+            info["mime"] = tika_meta["Content-Type"][0]
+        else:
+            info["mime"] = tika_meta["Content-Type"]
+
+        if tika_content:
+            info["content"] = tika_content.lstrip()[:self.content_len]
+
+        if "Content-Encoding" in tika_meta:
+            info["encoding"] = tika_meta["Content-Encoding"]
+
+        return info
diff --git a/requirements.txt b/requirements.txt
index f36d500..492b60c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,4 +17,5 @@ docx2txt
 xlrd
 six
 cairosvg
-ffmpeg-python
\ No newline at end of file
+ffmpeg-python
+tika
\ No newline at end of file
diff --git a/test/test_DocxParser.py b/test/test_DocxParser.py
index 05b9905..295b503 100644
--- a/test/test_DocxParser.py
+++ b/test/test_DocxParser.py
@@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import DocxParser
+from parsing import TikaFileParser
 import os
 
 dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,7 +9,7 @@ class DocxParserTest(TestCase):
 
     def test_parse_content(self):
 
-        parser = DocxParser([], 1000, dir_name + "/test_files/")
+        parser = TikaFileParser([], dir_name + "/test_files/", 1000)
 
         info = parser.parse(dir_name + "/test_files/docx1.docx")
 
diff --git a/test/test_EbookParser.py b/test/test_EbookParser.py
index 6ae2b48..ad02bd7 100644
--- a/test/test_EbookParser.py
+++ b/test/test_EbookParser.py
@@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import EbookParser
+from parsing import TikaFileParser
 import os
 
 dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,7 +9,7 @@ class EbookParserTest(TestCase):
 
     def test_parse_content(self):
 
-        parser = EbookParser([], 1000, dir_name + "/test_files/")
+        parser = TikaFileParser([], dir_name + "/test_files/", 1000)
 
         info = parser.parse(dir_name + "/test_files/epub1.epub")
 
diff --git a/test/test_PdfFileParser.py b/test/test_PdfFileParser.py
index 3484a8f..fad2c35 100644
--- a/test/test_PdfFileParser.py
+++ b/test/test_PdfFileParser.py
@@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import PdfFileParser
+from parsing import TikaFileParser
 import os
 
 dir_name = os.path.dirname(os.path.abspath(__file__))
@@ -9,9 +9,8 @@ class PdfParserTest(TestCase):
 
     def test_parse_content(self):
 
-        parser = PdfFileParser([], 12488, "test_files/")
+        parser = TikaFileParser([], "test_files/", 12488)
 
         info = parser.parse(dir_name + "/test_files/pdf1.pdf")
 
         self.assertEqual(len(info["content"]), 12488)
-        self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
diff --git a/test/test_SpreadSheetParser.py b/test/test_SpreadSheetParser.py
index f99acac..af52feb 100644
--- a/test/test_SpreadSheetParser.py
+++ b/test/test_SpreadSheetParser.py
@@ -1,5 +1,5 @@
 from unittest import TestCase
-from parsing import SpreadSheetParser
+from parsing import TikaFileParser
 import os
 
 
@@ -10,7 +10,7 @@ class PdfParserTest(TestCase):
 
     def test_parse_content_xls(self):
 
-        parser = SpreadSheetParser([], 1500, "test_files/")
+        parser = TikaFileParser([], "test_files/", 1500)
 
         info = parser.parse(dir_name + "/test_files/xls1.xls")
 
@@ -18,7 +18,7 @@ class PdfParserTest(TestCase):
 
     def test_parse_content_xlsx(self):
 
-        parser = SpreadSheetParser([], 1500, "test_files/")
+        parser = TikaFileParser([], "test_files/", 1500)
 
        info = parser.parse(dir_name + "/test_files/xlsx1.xlsx")
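
A quick usage sketch (not part of the patch) of how the new TikaFileParser can be driven once the patch is applied. It assumes the tika package can reach a Java runtime so the bundled Tika server can start, and it reuses the pdf1.pdf fixture path referenced by the tests; the printed fields are illustrative only.

    from parsing import TikaFileParser

    # New signature: TikaFileParser(checksum_calculators, root_dir, content_len).
    # Empty checksum list, and the 4096-character default from config.py.
    parser = TikaFileParser([], "test/test_files/", 4096)

    # parse() fills "mime" and "content" (and "encoding" when Tika reports one)
    # from the Tika response; empty files skip Tika via the info["size"] check.
    info = parser.parse("test/test_files/pdf1.pdf")
    print(info["mime"], len(info.get("content", "")))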