mirror of
				https://github.com/simon987/Simple-Incremental-Search-Tool.git
				synced 2025-10-31 07:26:53 +00:00 
			
		
		
		
	Added xls & xlsx parsing
This commit is contained in:
		
							parent
							
								
									7e5d6fe1ac
								
							
						
					
					
						commit
						dff7ddc511
					
				
							
								
								
									
										58
									
								
								parsing.py
									
									
									
									
									
								
							
							
						
						
									
										58
									
								
								parsing.py
									
									
									
									
									
								
							| @ -8,6 +8,7 @@ import chardet | |||||||
| import html | import html | ||||||
| import warnings | import warnings | ||||||
| import docx2txt | import docx2txt | ||||||
|  | import xlrd | ||||||
| from pdfminer.pdfparser import PDFParser | from pdfminer.pdfparser import PDFParser | ||||||
| from pdfminer.pdfdocument import PDFDocument | from pdfminer.pdfdocument import PDFDocument | ||||||
| from pdfminer.pdfpage import PDFPage | from pdfminer.pdfpage import PDFPage | ||||||
| @ -19,6 +20,8 @@ from ebooklib import epub | |||||||
| import ebooklib | import ebooklib | ||||||
| from PIL import Image | from PIL import Image | ||||||
| from fontTools.ttLib import TTFont, TTLibError | from fontTools.ttLib import TTFont, TTLibError | ||||||
|  | import six | ||||||
|  | from six.moves import xrange | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MimeGuesser: | class MimeGuesser: | ||||||
| @ -476,3 +479,58 @@ class DocxParser(GenericFileParser): | |||||||
|             info["content"] = text[0:self.content_length] |             info["content"] = text[0:self.content_length] | ||||||
| 
 | 
 | ||||||
|         return info |         return info | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class SpreadSheetParser(GenericFileParser):
    """Parser that extracts the cell text of Excel workbooks through xlrd.

    The extracted text is stored under ``info["content"]`` and is capped at
    ``content_length`` characters.
    """

    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """
        :param checksum_calculators: forwarded unchanged to GenericFileParser
        :param content_length: maximum number of characters kept in "content"
        """
        super().__init__(checksum_calculators)

        self.content_length = content_length

        self.mime_types = [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ]

    @staticmethod
    def _row_texts(workbook):
        """Yield one newline-terminated line per non-empty row, sheet by sheet.

        Sheets are fetched lazily, so a caller that stops iterating early never
        touches the remaining sheets.
        """
        for sheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(sheet_name)

            for row_index in range(worksheet.nrows):
                cells = []
                for col_index in range(worksheet.ncols):
                    value = worksheet.cell_value(row_index, col_index)
                    # Falsy cells (empty strings, but also numeric zeros) are skipped
                    if value:
                        if isinstance(value, (int, float)):
                            value = str(value)
                        cells.append(value)

                if cells:
                    yield " ".join(cells) + "\n"

    def parse(self, full_path: str):
        """Parse the spreadsheet at full_path.

        :param full_path: path of the workbook to read
        :return: the info dict produced by GenericFileParser.parse. "content"
                 is added only when the workbook could be read; on an
                 xlrd.biffh.XLRDError a message is printed and the dict is
                 returned without it (previously None was returned).
        """
        info = super().parse(full_path)

        # The MIT License (MIT)
        # Copyright (c) 2014 Dean Malmgren
        # https://github.com/deanmalmgren/textract/blob/master/textract/parsers/xlsx_parser.py

        pieces = []
        remaining = self.content_length

        try:
            workbook = xlrd.open_workbook(full_path)

            for text in self._row_texts(workbook):
                if len(text) >= remaining:
                    # Limit reached: keep what still fits and stop reading
                    # rows AND sheets (the generator is simply abandoned).
                    pieces.append(text[:remaining])
                    break
                pieces.append(text)
                remaining -= len(text)

        except xlrd.biffh.XLRDError:
            print("Couldn't parse spreadsheet: " + full_path)
        else:
            info["content"] = "".join(pieces)

        # Always hand the info dict back, even when the workbook is unreadable
        return info
|  | 
 | ||||||
|  | |||||||
| @ -14,3 +14,5 @@ pdfminer.six | |||||||
| ebooklib | ebooklib | ||||||
| html2text | html2text | ||||||
| docx2txt | docx2txt | ||||||
|  | xlrd | ||||||
|  | six | ||||||
							
								
								
									
										21
									
								
								spec/SpreadSheetParser_spec.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								spec/SpreadSheetParser_spec.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,21 @@ | |||||||
|  | from unittest import TestCase | ||||||
|  | from parsing import SpreadSheetParser | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class PdfParserTest(TestCase):
    """Checks SpreadSheetParser against the .xls and .xlsx sample workbooks.

    NOTE(review): the class name looks like a leftover from the PDF parser
    spec; it is kept unchanged so test selection by name keeps working.
    """

    def test_parse_content_xls(self):
        # The parser is asked for at most 1500 characters; the test expects
        # the extracted content to be exactly that long.
        result = SpreadSheetParser([], 1500).parse("test_files/xls1.xls")

        self.assertEqual(len(result["content"]), 1500)

    def test_parse_content_xlsx(self):
        # Same expectation for the .xlsx sample file.
        result = SpreadSheetParser([], 1500).parse("test_files/xlsx1.xlsx")

        self.assertEqual(len(result["content"]), 1500)
							
								
								
									
										
											BIN
										
									
								
								spec/test_files/xls1.xls
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								spec/test_files/xls1.xls
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								spec/test_files/xlsx1.xlsx
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								spec/test_files/xlsx1.xlsx
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user