Added xls & xlsx parsing

2025-11-30 09:13:18 +00:00 · 2018-04-16 20:40:49 -04:00 · 2018-04-16 20:40:49 -04:00 · dff7ddc511
commit dff7ddc511
parent 7e5d6fe1ac
5 changed files with 82 additions and 1 deletions
--- a/parsing.py
+++ b/parsing.py
@ -8,6 +8,7 @@ import chardet
 import html
 import warnings
 import docx2txt
 import xlrd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@ -19,6 +20,8 @@ from ebooklib import epub
 import ebooklib
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
 import six
 from six.moves import xrange
 class MimeGuesser:
@ -476,3 +479,58 @@ class DocxParser(GenericFileParser):
            info["content"] = text[0:self.content_length]
        return info
 class SpreadSheetParser(GenericFileParser):
    is_default = False
    def __init__(self, checksum_calculators: list, content_length: int):
        super().__init__(checksum_calculators)
        self.content_length = content_length
        self.mime_types = [
            "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]
    def parse(self, full_path: str):
        info = super().parse(full_path)
        # The MIT License (MIT)
        # Copyright (c) 2014 Dean Malmgren
        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
        try:
            workbook = xlrd.open_workbook(full_path)
            sheets_name = workbook.sheet_names()
            info["content"] = ""
            for names in sheets_name:
                worksheet = workbook.sheet_by_name(names)
                num_rows = worksheet.nrows
                num_cells = worksheet.ncols
                for curr_row in range(num_rows):
                    row = worksheet.row(curr_row)
                    new_output = []
                    for index_col in xrange(num_cells):
                        value = worksheet.cell_value(curr_row, index_col)
                        if value:
                            if isinstance(value, (int, float)):
                                value = six.text_type(value)
                            new_output.append(value)
                    if new_output:
                        text = u' '.join(new_output) + u'\n'
                        if len(info["content"]) + len(text) <= self.content_length:
                            info["content"] += text
                        else:
                            info["content"] += text[0:self.content_length - len(info["content"])]
                            break
            return info
        except xlrd.biffh.XLRDError:
            print("Couldn't parse spreadsheet: " + full_path)
--- a/requirements.txt
+++ b/requirements.txt
@ -14,3 +14,5 @@ pdfminer.six
 ebooklib
 html2text
 docx2txt
 xlrd
 six
--- a/spec/SpreadSheetParser_spec.py
+++ b/spec/SpreadSheetParser_spec.py
@ -0,0 +1,21 @@
 from unittest import TestCase
 from parsing import SpreadSheetParser
 class PdfParserTest(TestCase):
    def test_parse_content_xls(self):
        parser = SpreadSheetParser([], 1500)
        info = parser.parse("test_files/xls1.xls")
        self.assertEqual(len(info["content"]), 1500)
    def test_parse_content_xlsx(self):
        parser = SpreadSheetParser([], 1500)
        info = parser.parse("test_files/xlsx1.xlsx")
        self.assertEqual(len(info["content"]), 1500)
--- a/spec/test_files/xls1.xls
+++ b/spec/test_files/xls1.xls
--- a/spec/test_files/xlsx1.xlsx
+++ b/spec/test_files/xlsx1.xlsx