Added xls & xlsx parsing

This commit is contained in:
simon987 2018-04-16 20:40:49 -04:00
parent 7e5d6fe1ac
commit dff7ddc511
5 changed files with 82 additions and 1 deletions

View File

@ -8,6 +8,7 @@ import chardet
import html
import warnings
import docx2txt
import xlrd
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
@ -19,6 +20,8 @@ from ebooklib import epub
import ebooklib
from PIL import Image
from fontTools.ttLib import TTFont, TTLibError
import six
from six.moves import xrange
class MimeGuesser:
@ -476,3 +479,58 @@ class DocxParser(GenericFileParser):
info["content"] = text[0:self.content_length]
return info
class SpreadSheetParser(GenericFileParser):
    """Extract the text content of .xls / .xlsx workbooks.

    Cell values are read with xlrd, sheet by sheet and row by row. The
    non-empty cells of a row are joined with single spaces and terminated
    by a newline. Extraction stops as soon as ``content_length``
    characters have been collected.
    """

    # NOTE(review): presumably read by the code that picks a parser for a
    # mime type -- confirm against the parser registry.
    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """Create the parser.

        :param checksum_calculators: forwarded unchanged to GenericFileParser
        :param content_length: maximum number of characters stored in
            ``info["content"]``
        """
        super().__init__(checksum_calculators)

        self.content_length = content_length

        self.mime_types = [
            "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]

    def parse(self, full_path: str):
        """Parse the spreadsheet at ``full_path``.

        :param full_path: path of the workbook to read
        :return: the dict produced by ``GenericFileParser.parse``. When the
            workbook could be read it additionally holds a ``"content"``
            entry with at most ``content_length`` characters of text. When
            xlrd raises ``XLRDError`` a message is printed and the dict is
            returned without that entry (previously ``None`` was returned,
            discarding the generic file information).
        """
        info = super().parse(full_path)

        # The MIT License (MIT)
        # Copyright (c) 2014 Dean Malmgren
        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py

        try:
            workbook = xlrd.open_workbook(full_path)
            info["content"] = self._extract_text(workbook)
        except xlrd.biffh.XLRDError:
            print("Couldn't parse spreadsheet: " + full_path)

        return info

    def _extract_text(self, workbook) -> str:
        """Return up to ``content_length`` characters of text from ``workbook``."""
        # Collect the pieces in a list and join once instead of growing a
        # string row after row.
        pieces = []
        # A negative limit would turn the truncating slice below into a
        # "from the end" slice, so treat it as zero.
        remaining = max(self.content_length, 0)

        for sheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(sheet_name)

            for row_index in range(worksheet.nrows):
                cells = []
                for col_index in range(worksheet.ncols):
                    value = worksheet.cell_value(row_index, col_index)
                    # Falsy values (empty strings, but also 0 / 0.0) are
                    # skipped, exactly as in the textract code this is
                    # derived from.
                    if not value:
                        continue
                    if isinstance(value, (int, float)):
                        value = str(value)
                    cells.append(value)

                if not cells:
                    continue

                line = " ".join(cells) + "\n"
                if len(line) > remaining:
                    # Limit reached: keep what still fits and stop reading
                    # the remaining rows *and* the remaining sheets.
                    pieces.append(line[:remaining])
                    return "".join(pieces)

                pieces.append(line)
                remaining -= len(line)

        return "".join(pieces)

View File

@ -13,4 +13,6 @@ unicodedata2
pdfminer.six
ebooklib
html2text
docx2txt
docx2txt
xlrd
six

View File

@ -0,0 +1,21 @@
from unittest import TestCase
from parsing import SpreadSheetParser
class PdfParserTest(TestCase):
    """Check that SpreadSheetParser caps the extracted content length.

    NOTE(review): the class name mentions PDF although both tests exercise
    SpreadSheetParser -- it looks copy-pasted; the name is kept as-is.
    """

    def test_parse_content_xls(self):
        """A legacy .xls fixture parsed with a 1500 limit yields 1500 characters."""
        content = SpreadSheetParser([], 1500).parse("test_files/xls1.xls")["content"]

        self.assertEqual(len(content), 1500)

    def test_parse_content_xlsx(self):
        """An .xlsx fixture parsed with a 1500 limit yields 1500 characters."""
        content = SpreadSheetParser([], 1500).parse("test_files/xlsx1.xlsx")["content"]

        self.assertEqual(len(content), 1500)

BIN
spec/test_files/xls1.xls Executable file

Binary file not shown.

BIN
spec/test_files/xlsx1.xlsx Executable file

Binary file not shown.