Mirror of https://github.com/simon987/hexlib.git — synced 2025-10-31 16:16:52 +00:00
			
		
		
		
	improve text cleaning
This commit is contained in:
		
							parent
							
								
									18cd59fc4a
								
							
						
					
					
						commit
						45b5803c40
					
# Pre-compiled patterns used by clean() — compiled once at import time.
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")  # bare http(s) URLs
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")  # double-quoted href="..." attribute values
WHITESPACE_RE = re.compile(r"\s+")  # runs of whitespace (collapse / word-split)
PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+")  # punctuation replaced when remove_punctuation=True
|  | |||||||
| @ -1,3 +1,6 @@ | |||||||
|  | from functools import partial | ||||||
|  | from multiprocessing.pool import ThreadPool | ||||||
|  | 
 | ||||||
| import nltk.corpus | import nltk.corpus | ||||||
| from lxml import etree | from lxml import etree | ||||||
| from nltk.corpus import stopwords | from nltk.corpus import stopwords | ||||||
| @ -15,11 +18,22 @@ nltk.download("wordnet", quiet=True) | |||||||
| lemmatizer = WordNetLemmatizer() | lemmatizer = WordNetLemmatizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def clean_multithread(texts, processes, **kwargs):
    """Clean many texts concurrently.

    Applies ``clean(text, **kwargs)`` to every item of *texts* on a
    thread pool and returns the cleaned strings as a list, in input
    order.

    :param texts: iterable of strings to clean
    :param processes: number of worker threads in the pool
    :param kwargs: keyword flags forwarded verbatim to clean()
    :return: list of cleaned strings
    """
    # Use the pool as a context manager so its worker threads are
    # terminated even if clean() raises — the original never closed
    # the pool, leaking threads on every call.
    with ThreadPool(processes=processes) as pool:
        return pool.map(
            func=partial(clean, **kwargs),
            iterable=texts,
        )
|  | 
 | ||||||
|  | 
 | ||||||
| def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, | def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, | ||||||
|           remove_stopwords_en=False, lemmatize=False): |           remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False): | ||||||
|     if compress_whitespace and remove_stopwords_en: |     if compress_whitespace and remove_stopwords_en: | ||||||
|         raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace") |         raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace") | ||||||
| 
 | 
 | ||||||
|  |     if fix_single_quotes: | ||||||
|  |         text = text.replace("`", "'") | ||||||
|  | 
 | ||||||
|     if clean_html: |     if clean_html: | ||||||
|         try: |         try: | ||||||
|             root = etree.fromstring(text) |             root = etree.fromstring(text) | ||||||
| @ -27,6 +41,9 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st | |||||||
|         except: |         except: | ||||||
|             pass |             pass | ||||||
| 
 | 
 | ||||||
|  |     if remove_punctuation: | ||||||
|  |         text = PUNCTUATION_RE.sub(" ", text) | ||||||
|  | 
 | ||||||
|     if lowercase: |     if lowercase: | ||||||
|         text = text.lower() |         text = text.lower() | ||||||
| 
 | 
 | ||||||
| @ -36,9 +53,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st | |||||||
|     if strip: |     if strip: | ||||||
|         text = text.strip() |         text = text.strip() | ||||||
| 
 | 
 | ||||||
|     if remove_punctuation: |  | ||||||
|         text = PUNCTUATION_RE.sub("", text) |  | ||||||
| 
 |  | ||||||
|     if remove_stopwords_en or lemmatize: |     if remove_stopwords_en or lemmatize: | ||||||
|         words = WHITESPACE_RE.split(text) |         words = WHITESPACE_RE.split(text) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -103,3 +103,33 @@ class TestText(TestCase): | |||||||
|         expected = "hello world" |         expected = "hello world" | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(cleaned, expected) |         self.assertEqual(cleaned, expected) | ||||||
|  | 
 | ||||||
|  |     def test_html_8(self): | ||||||
|  |         text = "<div>\n Hello, \t\n<strong>a the worlds!    </strong>\n\t</div>" | ||||||
|  |         cleaned = clean( | ||||||
|  |             text, | ||||||
|  |             clean_html=True, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_punctuation=True, | ||||||
|  |             strip=True, | ||||||
|  |             remove_stopwords_en=True, | ||||||
|  |             lemmatize=True | ||||||
|  |         ) | ||||||
|  |         expected = "hello world" | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
|  | 
 | ||||||
|  |     def test_html_9(self): | ||||||
|  |         text = "<div>\n Hello, \t\n<strong>world! it's it`s   </strong>\n\t</div>" | ||||||
|  |         cleaned = clean( | ||||||
|  |             text, | ||||||
|  |             clean_html=True, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_punctuation=True, | ||||||
|  |             strip=True, | ||||||
|  |             lemmatize=True, | ||||||
|  |             fix_single_quotes=True | ||||||
|  |         ) | ||||||
|  |         expected = "hello world it's it's" | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user