Added xls & xlsx parsing

This commit is contained in:
simon987 2018-04-16 20:40:49 -04:00
parent 7e5d6fe1ac
commit dff7ddc511
5 changed files with 82 additions and 1 deletions

View File

@ -8,6 +8,7 @@ import chardet
import html import html
import warnings import warnings
import docx2txt import docx2txt
import xlrd
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -19,6 +20,8 @@ from ebooklib import epub
import ebooklib import ebooklib
from PIL import Image from PIL import Image
from fontTools.ttLib import TTFont, TTLibError from fontTools.ttLib import TTFont, TTLibError
import six
from six.moves import xrange
class MimeGuesser: class MimeGuesser:
@ -476,3 +479,58 @@ class DocxParser(GenericFileParser):
info["content"] = text[0:self.content_length] info["content"] = text[0:self.content_length]
return info return info
class SpreadSheetParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_length: int):
super().__init__(checksum_calculators)
self.content_length = content_length
self.mime_types = [
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
def parse(self, full_path: str):
info = super().parse(full_path)
# The MIT License (MIT)
# Copyright (c) 2014 Dean Malmgren
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
try:
workbook = xlrd.open_workbook(full_path)
sheets_name = workbook.sheet_names()
info["content"] = ""
for names in sheets_name:
worksheet = workbook.sheet_by_name(names)
num_rows = worksheet.nrows
num_cells = worksheet.ncols
for curr_row in range(num_rows):
row = worksheet.row(curr_row)
new_output = []
for index_col in xrange(num_cells):
value = worksheet.cell_value(curr_row, index_col)
if value:
if isinstance(value, (int, float)):
value = six.text_type(value)
new_output.append(value)
if new_output:
text = u' '.join(new_output) + u'\n'
if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text
else:
info["content"] += text[0:self.content_length - len(info["content"])]
break
return info
except xlrd.biffh.XLRDError:
print("Couldn't parse spreadsheet: " + full_path)

View File

@ -14,3 +14,5 @@ pdfminer.six
ebooklib ebooklib
html2text html2text
docx2txt docx2txt
xlrd
six

View File

@ -0,0 +1,21 @@
from unittest import TestCase
from parsing import SpreadSheetParser
class PdfParserTest(TestCase):
def test_parse_content_xls(self):
parser = SpreadSheetParser([], 1500)
info = parser.parse("test_files/xls1.xls")
self.assertEqual(len(info["content"]), 1500)
def test_parse_content_xlsx(self):
parser = SpreadSheetParser([], 1500)
info = parser.parse("test_files/xlsx1.xlsx")
self.assertEqual(len(info["content"]), 1500)

BIN
spec/test_files/xls1.xls Executable file

Binary file not shown.

BIN
spec/test_files/xlsx1.xlsx Executable file

Binary file not shown.