mirror of
				https://github.com/simon987/hexlib.git
				synced 2025-10-24 21:46:54 +00:00 
			
		
		
		
	Add text cleaning function
This commit is contained in:
		
							parent
							
								
									30902c8235
								
							
						
					
					
						commit
						765f6f59b7
					
				| @ -2,3 +2,5 @@ import re | |||||||
| 
 | 
 | ||||||
| LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | ||||||
| HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") | HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") | ||||||
|  | WHITESPACE_RE = re.compile(r"\s+") | ||||||
|  | PUNCTUATION_RE = re.compile(r"[.,;:\"']+") | ||||||
|  | |||||||
							
								
								
									
										54
									
								
								hexlib/text.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								hexlib/text.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,54 @@ | |||||||
|  | import nltk.corpus | ||||||
|  | from hexlib.misc import silent_stdout | ||||||
|  | from lxml import etree | ||||||
|  | from nltk.corpus import stopwords | ||||||
|  | from nltk.stem import WordNetLemmatizer | ||||||
|  | 
 | ||||||
|  | from .regex import WHITESPACE_RE, PUNCTUATION_RE | ||||||
|  | 
 | ||||||
# Compiled XPath expression that selects every text node in a parsed tree.
get_text = etree.XPath("//text()")

# Download the corpora BEFORE anything reads them: building the stop-word set
# below loads the "stopwords" corpus, which fails on a machine where it has
# not been downloaded yet.
with silent_stdout:
    nltk.download("stopwords")
    nltk.download("wordnet")

# English stop words as a set, for constant-time membership tests in clean().
stop_words_en = set(stopwords.words("english"))

# Shared lemmatizer instance reused by every clean() call.
lemmatizer = WordNetLemmatizer()
|  | 
 | ||||||
|  | 
 | ||||||
def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
          remove_stopwords_en=False, lemmatize=False):
    """Apply the selected clean-up steps to *text* and return the result.

    The steps always run in the same order, each one only when its flag is
    set: markup removal, lowercasing, whitespace compression, stripping,
    punctuation removal, then stop-word removal and/or lemmatization.

    :param text: The string to clean.
    :param compress_whitespace: Replace every run of whitespace with a single
        space.
    :param lowercase: Convert the text to lowercase.
    :param clean_html: Parse the text as markup and keep only its text nodes.
        Input that cannot be parsed is left unchanged.
    :param strip: Remove leading and trailing whitespace.
    :param remove_punctuation: Delete every match of ``PUNCTUATION_RE``.
    :param remove_stopwords_en: Drop tokens found in ``stop_words_en``. The
        membership test is an exact, case-sensitive string match.
    :param lemmatize: Replace each token with ``lemmatizer.lemmatize(token)``.
    :raises ValueError: If both ``compress_whitespace`` and
        ``remove_stopwords_en`` are set.
    :return: The cleaned text.
    """
    if compress_whitespace and remove_stopwords_en:
        raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace")

    if clean_html:
        try:
            root = etree.fromstring(text)
            text = "".join(get_text(root))
        except (etree.LxmlError, ValueError, TypeError):
            # Best effort: text that is not parseable markup is kept as-is.
            # Only parse-related errors are caught, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass

    if lowercase:
        text = text.lower()

    if compress_whitespace:
        text = WHITESPACE_RE.sub(" ", text)

    if strip:
        text = text.strip()

    if remove_punctuation:
        text = PUNCTUATION_RE.sub("", text)

    if remove_stopwords_en or lemmatize:
        # Splitting on whitespace runs and re-joining with single spaces also
        # compresses whitespace, which is why compress_whitespace is rejected
        # as redundant alongside remove_stopwords_en.
        words = WHITESPACE_RE.split(text)

        # Stop words are filtered before lemmatizing, so the lookup sees the
        # tokens exactly as they appear in the text.
        if remove_stopwords_en:
            words = [w for w in words if w not in stop_words_en]
        if lemmatize:
            words = [lemmatizer.lemmatize(w) for w in words]

        text = " ".join(words)

    return text
							
								
								
									
										4
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								setup.py
									
									
									
									
									
								
							| @ -2,7 +2,7 @@ from setuptools import setup | |||||||
| 
 | 
 | ||||||
| setup( | setup( | ||||||
|     name="hexlib", |     name="hexlib", | ||||||
|     version="1.38", |     version="1.39", | ||||||
|     description="Misc utility methods", |     description="Misc utility methods", | ||||||
|     author="simon987", |     author="simon987", | ||||||
|     author_email="me@simon987.net", |     author_email="me@simon987.net", | ||||||
| @ -13,6 +13,6 @@ setup( | |||||||
|     ]}, |     ]}, | ||||||
|     install_requires=[ |     install_requires=[ | ||||||
|         "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard", |         "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard", | ||||||
|         "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4" |         "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4", "lxml", "nltk" | ||||||
|     ] |     ] | ||||||
| ) | ) | ||||||
|  | |||||||
							
								
								
									
										105
									
								
								test/test_text.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										105
									
								
								test/test_text.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,105 @@ | |||||||
|  | from unittest import TestCase | ||||||
|  | 
 | ||||||
|  | from hexlib.text import clean | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TestText(TestCase):
    """Checks for ``clean``, switching on more options from case to case."""

    def test_html_invalid(self):
        # An empty string with clean_html set must come back unchanged.
        result = clean("", clean_html=True)

        self.assertEqual(result, "")

    def test_html_1(self):
        # Markup removal only: tags are dropped, text nodes are concatenated.
        result = clean("<div>Hello, <strong>world</strong></div>", clean_html=True)

        self.assertEqual(result, "Hello, world")

    def test_html_2(self):
        # Markup removal followed by lowercasing.
        result = clean("<div>Hello, <strong>world</strong></div>", clean_html=True, lowercase=True)

        self.assertEqual(result, "hello, world")

    def test_html_3(self):
        # Whitespace runs collapse to single spaces; the outer ones remain.
        markup = "<div>\n Hello, \t\n<strong> world    </strong>\n\t</div>"
        result = clean(markup, clean_html=True, lowercase=True, compress_whitespace=True)

        self.assertEqual(result, " hello, world ")

    def test_html_4(self):
        # Same as above, plus stripping of the outer whitespace.
        markup = "<div>\n Hello, \t\n<strong> world    </strong>\n\t</div>"
        result = clean(markup, clean_html=True, lowercase=True, compress_whitespace=True, strip=True)

        self.assertEqual(result, "hello, world")

    def test_html_5(self):
        # Punctuation removal on top of the previous options.
        markup = "<div>\n Hello, \t\n<strong> world    </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            compress_whitespace=True,
            strip=True,
            remove_punctuation=True,
        )

        self.assertEqual(result, "hello world")

    def test_html_6(self):
        # Stop-word removal: "a" and "the" are expected to disappear.
        markup = "<div>\n Hello, \t\n<strong>a the world    </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
        )

        self.assertEqual(result, "hello world")

    def test_html_7(self):
        # Stop-word removal plus lemmatization: "worlds" becomes "world".
        markup = "<div>\n Hello, \t\n<strong>a the worlds    </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
            lemmatize=True,
        )

        self.assertEqual(result, "hello world")
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user