mirror of https://github.com/simon987/hexlib.git, synced 2025-10-31 16:16:52 +00:00
			
		
		
		
	Fix clean_html
This commit is contained in:
		
							parent db3e191983
						commit 4d6c8018df
					
				| @ -4,3 +4,4 @@ LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | ||||
| HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") | ||||
| WHITESPACE_RE = re.compile(r"\s+") | ||||
| PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+") | ||||
| XML_ENTITY_RE = re.compile(r"&[a-z]+;") | ||||
|  | ||||
| @ -6,7 +6,7 @@ from lxml import etree | ||||
| from nltk.corpus import stopwords | ||||
| from nltk.stem import WordNetLemmatizer | ||||
| 
 | ||||
| from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE | ||||
| from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE, XML_ENTITY_RE | ||||
| 
 | ||||
| get_text = etree.XPath("//text()") | ||||
| 
 | ||||
| @ -56,9 +56,16 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc | ||||
| 
 | ||||
|     if clean_html: | ||||
|         try: | ||||
|             root = etree.fromstring(text.replace("&", "")) | ||||
|             text = "".join(get_text(root)) | ||||
|         except: | ||||
|             text = XML_ENTITY_RE.sub(" ", text) | ||||
|             text = text.replace("&", " ") | ||||
|             text = text.replace("<br>", "<br/>") | ||||
|             text = "<root>" + text + "</root>" | ||||
| 
 | ||||
|             root = etree.fromstring(text) | ||||
| 
 | ||||
|             text = " ".join(get_text(root)) | ||||
|         except Exception as e: | ||||
|             raise e | ||||
|             pass | ||||
| 
 | ||||
|     if remove_punctuation: | ||||
|  | ||||
| @ -164,6 +164,25 @@ class TestText(TestCase): | ||||
| 
 | ||||
|         self.assertEqual(cleaned, expected) | ||||
| 
 | ||||
|     def test_html_no_root(self): | ||||
|         text = "<a href=\"#p217709510\" class=\"quotelink\">>>217709510</a><br>Is there a servant that is against civilization and humanity?<br>Literally instant summon." | ||||
|          | ||||
|         cleaned = preprocess( | ||||
|             text, | ||||
|             clean_html=True, | ||||
|             lowercase=True, | ||||
|             remove_punctuation=True, | ||||
|             strip=True, | ||||
|             lemmatize=False, | ||||
|             fix_single_quotes=True, | ||||
|             remove_stopwords_en=False, | ||||
|             remove_urls=False | ||||
|         ) | ||||
|          | ||||
|         expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" | ||||
| 
 | ||||
|         self.assertEqual(cleaned, expected) | ||||
| 
 | ||||
|     def test_bigrams(self): | ||||
|         text = "x A b c d e f g h" | ||||
|         cleaned = preprocess( | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user