mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-19 18:16:45 +00:00
Added xls & xlsx parsing
This commit is contained in:
parent
7e5d6fe1ac
commit
dff7ddc511
58
parsing.py
58
parsing.py
@ -8,6 +8,7 @@ import chardet
|
|||||||
import html
|
import html
|
||||||
import warnings
|
import warnings
|
||||||
import docx2txt
|
import docx2txt
|
||||||
|
import xlrd
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
@ -19,6 +20,8 @@ from ebooklib import epub
|
|||||||
import ebooklib
|
import ebooklib
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from fontTools.ttLib import TTFont, TTLibError
|
from fontTools.ttLib import TTFont, TTLibError
|
||||||
|
import six
|
||||||
|
from six.moves import xrange
|
||||||
|
|
||||||
|
|
||||||
class MimeGuesser:
|
class MimeGuesser:
|
||||||
@ -476,3 +479,58 @@ class DocxParser(GenericFileParser):
|
|||||||
info["content"] = text[0:self.content_length]
|
info["content"] = text[0:self.content_length]
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
|
||||||
|
class SpreadSheetParser(GenericFileParser):
    """Parser that extracts the text of Excel workbooks with xlrd.

    Handles the ``.xls`` and ``.xlsx`` mime types listed in ``mime_types``
    and stores at most ``content_length`` characters of cell text under
    ``info["content"]``.

    NOTE(review): recent xlrd releases reportedly only read ``.xls`` files;
    confirm the pinned xlrd version still opens ``.xlsx`` workbooks.
    """
    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """Create the parser.

        :param checksum_calculators: forwarded unchanged to GenericFileParser
        :param content_length: maximum number of characters kept in
            ``info["content"]``
        """
        super().__init__(checksum_calculators)

        self.content_length = content_length

        self.mime_types = [
            "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]

    def parse(self, full_path: str):
        """Parse the workbook at ``full_path``.

        :param full_path: path of the spreadsheet file
        :return: the dict produced by the parent ``parse``; ``"content"`` is
            added when the workbook could be read. The dict is returned even
            when xlrd fails, so callers never receive ``None``.
        """
        info = super().parse(full_path)

        # The MIT License (MIT)
        # Copyright (c) 2014 Dean Malmgren
        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py

        try:
            workbook = xlrd.open_workbook(full_path)
            info["content"] = self._extract_text(workbook)
        except xlrd.biffh.XLRDError:
            # Best effort: report the failure and keep the generic file info
            print("Couldn't parse spreadsheet: " + full_path)

        return info

    def _extract_text(self, workbook) -> str:
        """Join the cell values of every sheet, capped at ``content_length``.

        Each non-empty row becomes one line: its cell values separated by a
        single space and terminated by a newline.
        """
        remaining = self.content_length
        chunks = []

        for sheet_name in workbook.sheet_names():
            if remaining <= 0:
                # Budget exhausted (or non-positive from the start):
                # no need to open the remaining sheets
                break

            worksheet = workbook.sheet_by_name(sheet_name)
            num_cols = worksheet.ncols

            for row_index in range(worksheet.nrows):
                cells = []
                for col_index in range(num_cols):
                    value = worksheet.cell_value(row_index, col_index)
                    # Falsy values (empty strings, but also numeric zeros)
                    # are skipped
                    if value:
                        if isinstance(value, (int, float)):
                            value = str(value)
                        cells.append(value)

                if not cells:
                    continue

                line = " ".join(cells) + "\n"
                # Slicing truncates the last line so the total never
                # exceeds content_length
                chunks.append(line[:remaining])
                remaining -= len(line)
                if remaining <= 0:
                    break

        return "".join(chunks)
|
||||||
|
|
||||||
|
@ -14,3 +14,5 @@ pdfminer.six
|
|||||||
ebooklib
|
ebooklib
|
||||||
html2text
|
html2text
|
||||||
docx2txt
|
docx2txt
|
||||||
|
xlrd
|
||||||
|
six
|
21
spec/SpreadSheetParser_spec.py
Normal file
21
spec/SpreadSheetParser_spec.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
from unittest import TestCase
|
||||||
|
from parsing import SpreadSheetParser
|
||||||
|
|
||||||
|
|
||||||
|
class PdfParserTest(TestCase):
    """Checks that SpreadSheetParser caps the extracted content length.

    NOTE(review): despite its name, this class only exercises
    SpreadSheetParser; the name is kept so existing test IDs do not change.
    Both tests presumably rely on fixtures holding at least 1500 characters
    of cell text - verify against the files in test_files/.
    """

    def test_parse_content_xls(self):
        """A legacy .xls workbook yields exactly 1500 characters."""
        extracted = SpreadSheetParser([], 1500).parse("test_files/xls1.xls")

        content_size = len(extracted["content"])
        self.assertEqual(content_size, 1500)

    def test_parse_content_xlsx(self):
        """An .xlsx workbook yields exactly 1500 characters."""
        extracted = SpreadSheetParser([], 1500).parse("test_files/xlsx1.xlsx")

        content_size = len(extracted["content"])
        self.assertEqual(content_size, 1500)
|
BIN
spec/test_files/xls1.xls
Executable file
BIN
spec/test_files/xls1.xls
Executable file
Binary file not shown.
BIN
spec/test_files/xlsx1.xlsx
Executable file
BIN
spec/test_files/xlsx1.xlsx
Executable file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user