Mirror of https://github.com/simon987/hexlib.git (synced 2025-10-31 16:16:52 +00:00)
			
		
		
		
	Fix clean html (again!)
This commit is contained in:
		
							parent
							
								
									31b35e3a32
								
							
						
					
					
						commit
						a7bf5b2d15
					
				| @ -3,5 +3,5 @@ import re | |||||||
| LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | ||||||
| HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") | HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") | ||||||
| WHITESPACE_RE = re.compile(r"\s+") | WHITESPACE_RE = re.compile(r"\s+") | ||||||
| PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+") | PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+") | ||||||
| XML_ENTITY_RE = re.compile(r"&[a-z]+;") | XML_ENTITY_RE = re.compile(r"&[a-z]+;") | ||||||
|  | |||||||
| @ -56,6 +56,8 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc | |||||||
| 
 | 
 | ||||||
|     if clean_html: |     if clean_html: | ||||||
|         try: |         try: | ||||||
|  |             text = "<root>" + text + "</root>" | ||||||
|  | 
 | ||||||
|             parser = etree.XMLParser(recover=True) |             parser = etree.XMLParser(recover=True) | ||||||
|             root = etree.fromstring(text, parser) |             root = etree.fromstring(text, parser) | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @ -2,7 +2,7 @@ from setuptools import setup | |||||||
| 
 | 
 | ||||||
| setup( | setup( | ||||||
|     name="hexlib", |     name="hexlib", | ||||||
|     version="1.46", |     version="1.47", | ||||||
|     description="Misc utility methods", |     description="Misc utility methods", | ||||||
|     author="simon987", |     author="simon987", | ||||||
|     author_email="me@simon987.net", |     author_email="me@simon987.net", | ||||||
|  | |||||||
| @ -180,7 +180,25 @@ class TestText(TestCase): | |||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" |         expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
| 
 | 
 | ||||||
|  |     def test_html_entity(self): | ||||||
|  |         text = "doesn&#39;t" | ||||||
|  | 
 | ||||||
|  |         cleaned = preprocess( | ||||||
|  |             text, | ||||||
|  |             clean_html=True, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_punctuation=True, | ||||||
|  |             strip=True, | ||||||
|  |             lemmatize=False, | ||||||
|  |             fix_single_quotes=True, | ||||||
|  |             remove_stopwords_en=False, | ||||||
|  |             remove_urls=False | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         expected = "doesn't" | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
| 
 | 
 | ||||||
|     def test_html_invalid_attribute(self): |     def test_html_invalid_attribute(self): | ||||||
|         text = '<root><iframe width="560" height="315" src=" " title="youtube video player" frameborder="0" allowfullscreen></iframe></root>' |         text = '<root><iframe width="560" height="315" src=" " title="youtube video player" frameborder="0" allowfullscreen></iframe></root>' | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user