mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 14:06:41 +00:00
Added xls & xlsx parsing
This commit is contained in:
parent
7e5d6fe1ac
commit
dff7ddc511
58
parsing.py
58
parsing.py
@ -8,6 +8,7 @@ import chardet
|
||||
import html
|
||||
import warnings
|
||||
import docx2txt
|
||||
import xlrd
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
@ -19,6 +20,8 @@ from ebooklib import epub
|
||||
import ebooklib
|
||||
from PIL import Image
|
||||
from fontTools.ttLib import TTFont, TTLibError
|
||||
import six
|
||||
from six.moves import xrange
|
||||
|
||||
|
||||
class MimeGuesser:
|
||||
@ -476,3 +479,58 @@ class DocxParser(GenericFileParser):
|
||||
info["content"] = text[0:self.content_length]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
class SpreadSheetParser(GenericFileParser):
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list, content_length: int):
|
||||
super().__init__(checksum_calculators)
|
||||
|
||||
self.content_length = content_length
|
||||
|
||||
self.mime_types = [
|
||||
"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
]
|
||||
|
||||
def parse(self, full_path: str):
|
||||
info = super().parse(full_path)
|
||||
|
||||
# The MIT License (MIT)
|
||||
# Copyright (c) 2014 Dean Malmgren
|
||||
# https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py
|
||||
|
||||
try:
|
||||
workbook = xlrd.open_workbook(full_path)
|
||||
|
||||
sheets_name = workbook.sheet_names()
|
||||
info["content"] = ""
|
||||
|
||||
for names in sheets_name:
|
||||
worksheet = workbook.sheet_by_name(names)
|
||||
num_rows = worksheet.nrows
|
||||
num_cells = worksheet.ncols
|
||||
|
||||
for curr_row in range(num_rows):
|
||||
row = worksheet.row(curr_row)
|
||||
new_output = []
|
||||
for index_col in xrange(num_cells):
|
||||
value = worksheet.cell_value(curr_row, index_col)
|
||||
if value:
|
||||
if isinstance(value, (int, float)):
|
||||
value = six.text_type(value)
|
||||
new_output.append(value)
|
||||
|
||||
if new_output:
|
||||
text = u' '.join(new_output) + u'\n'
|
||||
if len(info["content"]) + len(text) <= self.content_length:
|
||||
info["content"] += text
|
||||
else:
|
||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
||||
break
|
||||
|
||||
return info
|
||||
|
||||
except xlrd.biffh.XLRDError:
|
||||
print("Couldn't parse spreadsheet: " + full_path)
|
||||
|
||||
|
@ -13,4 +13,6 @@ unicodedata2
|
||||
pdfminer.six
|
||||
ebooklib
|
||||
html2text
|
||||
docx2txt
|
||||
docx2txt
|
||||
xlrd
|
||||
six
|
21
spec/SpreadSheetParser_spec.py
Normal file
21
spec/SpreadSheetParser_spec.py
Normal file
@ -0,0 +1,21 @@
|
||||
from unittest import TestCase
|
||||
from parsing import SpreadSheetParser
|
||||
|
||||
|
||||
class PdfParserTest(TestCase):
|
||||
|
||||
def test_parse_content_xls(self):
|
||||
|
||||
parser = SpreadSheetParser([], 1500)
|
||||
|
||||
info = parser.parse("test_files/xls1.xls")
|
||||
|
||||
self.assertEqual(len(info["content"]), 1500)
|
||||
|
||||
def test_parse_content_xlsx(self):
|
||||
|
||||
parser = SpreadSheetParser([], 1500)
|
||||
|
||||
info = parser.parse("test_files/xlsx1.xlsx")
|
||||
|
||||
self.assertEqual(len(info["content"]), 1500)
|
BIN
spec/test_files/xls1.xls
Executable file
BIN
spec/test_files/xls1.xls
Executable file
Binary file not shown.
BIN
spec/test_files/xlsx1.xlsx
Executable file
BIN
spec/test_files/xlsx1.xlsx
Executable file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user