mirror of
				https://github.com/simon987/Simple-Incremental-Search-Tool.git
				synced 2025-10-24 20:46:52 +00:00 
			
		
		
		
	Added xls & xlsx parsing
This commit is contained in:
		
							parent
							
								
									7e5d6fe1ac
								
							
						
					
					
						commit
						dff7ddc511
					
				
							
								
								
									
										58
									
								
								parsing.py
									
									
									
									
									
								
							
							
						
						
									
										58
									
								
								parsing.py
									
									
									
									
									
								
							| @ -8,6 +8,7 @@ import chardet | ||||
| import html | ||||
| import warnings | ||||
| import docx2txt | ||||
| import xlrd | ||||
| from pdfminer.pdfparser import PDFParser | ||||
| from pdfminer.pdfdocument import PDFDocument | ||||
| from pdfminer.pdfpage import PDFPage | ||||
| @ -19,6 +20,8 @@ from ebooklib import epub | ||||
| import ebooklib | ||||
| from PIL import Image | ||||
| from fontTools.ttLib import TTFont, TTLibError | ||||
| import six | ||||
| from six.moves import xrange | ||||
| 
 | ||||
| 
 | ||||
| class MimeGuesser: | ||||
| @ -476,3 +479,58 @@ class DocxParser(GenericFileParser): | ||||
|             info["content"] = text[0:self.content_length] | ||||
| 
 | ||||
|         return info | ||||
| 
 | ||||
| 
 | ||||
class SpreadSheetParser(GenericFileParser):
    """Parser that extracts cell text from spreadsheets opened with xlrd.

    The extracted text is stored under ``info["content"]`` and is capped at
    ``content_length`` characters.
    """

    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """Create the parser.

        :param checksum_calculators: forwarded unchanged to GenericFileParser
        :param content_length: maximum number of characters kept in
            ``info["content"]``
        """
        super().__init__(checksum_calculators)

        self.content_length = content_length

        self.mime_types = [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ]

    def parse(self, full_path: str):
        """Parse the spreadsheet at ``full_path``.

        :param full_path: path of the workbook to open
        :return: the info object produced by the parent ``parse``; on success
            it additionally holds the extracted text under ``"content"``.
            When xlrd raises ``XLRDError`` a message is printed and the info
            object is returned without a ``"content"`` entry (instead of the
            implicit ``None`` that was returned before).
        """
        info = super().parse(full_path)

        # The MIT License (MIT)
        # Copyright (c) 2014 Dean Malmgren
        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py

        try:
            workbook = xlrd.open_workbook(full_path)
            info["content"] = self._extract_text(workbook)
        except xlrd.biffh.XLRDError:
            print("Couldn't parse spreadsheet: " + full_path)

        return info

    def _extract_text(self, workbook) -> str:
        """Return at most ``self.content_length`` characters of workbook text.

        Each row with at least one truthy cell becomes one line: its cell
        values joined by single spaces and terminated by a newline.
        """
        # A negative limit is treated as "keep nothing" rather than being
        # used as a negative slice bound.
        remaining = max(self.content_length, 0)
        chunks = []

        for sheet_name in workbook.sheet_names():
            if remaining <= 0:
                # Limit reached: do not load or scan further sheets.
                break

            worksheet = workbook.sheet_by_name(sheet_name)
            column_count = worksheet.ncols

            for row_index in range(worksheet.nrows):
                cells = []
                for col_index in range(column_count):
                    value = worksheet.cell_value(row_index, col_index)
                    # NOTE: falsy values are skipped, which drops empty cells
                    # but also numeric zeros (behaviour kept from the
                    # original implementation).
                    if value:
                        if isinstance(value, (int, float)):
                            value = str(value)
                        cells.append(value)

                if not cells:
                    continue

                # Truncate the line that crosses the limit so the total
                # never exceeds content_length.
                line = (" ".join(cells) + "\n")[:remaining]
                chunks.append(line)
                remaining -= len(line)

                if remaining <= 0:
                    break

        return "".join(chunks)
| 
 | ||||
|  | ||||
| @ -14,3 +14,5 @@ pdfminer.six | ||||
| ebooklib | ||||
| html2text | ||||
| docx2txt | ||||
| xlrd | ||||
| six | ||||
							
								
								
									
										21
									
								
								spec/SpreadSheetParser_spec.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								spec/SpreadSheetParser_spec.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,21 @@ | ||||
| from unittest import TestCase | ||||
| from parsing import SpreadSheetParser | ||||
| 
 | ||||
| 
 | ||||
class PdfParserTest(TestCase):
    """Checks that SpreadSheetParser caps the extracted content length."""

    # NOTE(review): the class name mentions Pdf although every test here
    # exercises SpreadSheetParser; the name is kept unchanged so selecting
    # the tests by name keeps working.

    def _content_length_of(self, path):
        """Parse ``path`` with a 1500-character limit and return len(content)."""
        spreadsheet_parser = SpreadSheetParser([], 1500)
        parsed = spreadsheet_parser.parse(path)
        return len(parsed["content"])

    def test_parse_content_xls(self):
        self.assertEqual(self._content_length_of("test_files/xls1.xls"), 1500)

    def test_parse_content_xlsx(self):
        self.assertEqual(self._content_length_of("test_files/xlsx1.xlsx"), 1500)
							
								
								
									
										
											BIN
										
									
								
								spec/test_files/xls1.xls
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								spec/test_files/xls1.xls
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								spec/test_files/xlsx1.xlsx
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								spec/test_files/xlsx1.xlsx
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user