diff --git a/parsing.py b/parsing.py
index b9b2a0d..3960901 100644
--- a/parsing.py
+++ b/parsing.py
@@ -8,6 +8,7 @@ import chardet
 import html
 import warnings
 import docx2txt
+import xlrd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -19,6 +20,8 @@ from ebooklib import epub
 import ebooklib
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
+import six
+from six.moves import xrange
 
 
 class MimeGuesser:
@@ -476,3 +479,80 @@ class DocxParser(GenericFileParser):
         info["content"] = text[0:self.content_length]
 
         return info
+
+
+class SpreadSheetParser(GenericFileParser):
+    """Extract text content from Excel workbooks (.xls and .xlsx) using xlrd."""
+
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        """Set up the parser.
+
+        :param checksum_calculators: forwarded unchanged to GenericFileParser
+        :param content_length: maximum number of characters kept in info["content"]
+        """
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/vnd.ms-excel",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        ]
+
+    def parse(self, full_path: str):
+        """Return the file info with the workbook text stored under "content".
+
+        Sheets are read in the order returned by sheet_names(). Each row with
+        at least one truthy cell value becomes one line of space-separated
+        values, and the text is cut off at content_length characters.
+
+        If xlrd raises XLRDError, a message is printed and the info gathered
+        so far is returned instead of None.
+
+        :param full_path: path of the spreadsheet to parse
+        :return: the info object produced by GenericFileParser.parse
+        """
+        info = super().parse(full_path)
+
+        # The MIT License (MIT)
+        # Copyright (c) 2014 Dean Malmgren
+        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
+
+        try:
+            workbook = xlrd.open_workbook(full_path)
+
+            info["content"] = ""
+
+            for sheet_name in workbook.sheet_names():
+                worksheet = workbook.sheet_by_name(sheet_name)
+                num_rows = worksheet.nrows
+                num_cols = worksheet.ncols
+
+                for curr_row in range(num_rows):
+                    new_output = []
+                    for index_col in xrange(num_cols):
+                        value = worksheet.cell_value(curr_row, index_col)
+                        if value:
+                            if isinstance(value, (int, float)):
+                                value = six.text_type(value)
+                            new_output.append(value)
+
+                    if not new_output:
+                        continue
+
+                    text = u' '.join(new_output) + u'\n'
+                    remaining = self.content_length - len(info["content"])
+                    if len(text) > remaining:
+                        # Limit reached: keep what still fits and stop reading
+                        # the remaining rows *and* sheets.
+                        info["content"] += text[0:remaining]
+                        return info
+
+                    info["content"] += text
+
+        except xlrd.biffh.XLRDError:
+            print("Couldn't parse spreadsheet: " + full_path)
+
+        return info
diff --git a/requirements.txt b/requirements.txt
index 8e7d3e7..e01b120 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,6 @@ unicodedata2
 pdfminer.six
 ebooklib
 html2text
-docx2txt
\ No newline at end of file
+docx2txt
+xlrd<2.0
+six
\ No newline at end of file
diff --git a/spec/SpreadSheetParser_spec.py b/spec/SpreadSheetParser_spec.py
new file mode 100644
index 0000000..dc44dfb
--- /dev/null
+++ b/spec/SpreadSheetParser_spec.py
@@ -0,0 +1,22 @@
+from unittest import TestCase
+from parsing import SpreadSheetParser
+
+
+class SpreadSheetParserTest(TestCase):
+    """Check that parsed content is truncated to the requested length."""
+
+    def test_parse_content_xls(self):
+
+        parser = SpreadSheetParser([], 1500)
+
+        info = parser.parse("test_files/xls1.xls")
+
+        self.assertEqual(len(info["content"]), 1500)
+
+    def test_parse_content_xlsx(self):
+
+        parser = SpreadSheetParser([], 1500)
+
+        info = parser.parse("test_files/xlsx1.xlsx")
+
+        self.assertEqual(len(info["content"]), 1500)
diff --git a/spec/test_files/xls1.xls b/spec/test_files/xls1.xls
new file mode 100755
index 0000000..d617eb0
Binary files /dev/null and b/spec/test_files/xls1.xls differ
diff --git a/spec/test_files/xlsx1.xlsx b/spec/test_files/xlsx1.xlsx
new file mode 100755
index 0000000..81d6b1a
Binary files /dev/null and b/spec/test_files/xlsx1.xlsx differ