mirror of
				https://github.com/simon987/hexlib.git
				synced 2025-10-25 22:16:53 +00:00 
			
		
		
		
	Add strip_dashes option in preprocess()
This commit is contained in:
		
							parent
							
								
									c9fac7151a
								
							
						
					
					
						commit
						4befc3973d
					
				| @ -1,3 +1,4 @@ | |||||||
|  | import re | ||||||
| from functools import partial | from functools import partial | ||||||
| from itertools import chain, repeat | from itertools import chain, repeat | ||||||
| from multiprocessing.pool import Pool | from multiprocessing.pool import Pool | ||||||
| @ -53,6 +54,8 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l | |||||||
| DASHES = ("–", "⸺", "–", "—") | DASHES = ("–", "⸺", "–", "—") | ||||||
| DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES)))) | DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES)))) | ||||||
| 
 | 
 | ||||||
|  | DASHES_RE = re.compile(r"-+") | ||||||
|  | 
 | ||||||
| SPECIAL_PUNCTUATION = ";:\"/()|*=>" | SPECIAL_PUNCTUATION = ";:\"/()|*=>" | ||||||
| SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION)) | SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION)) | ||||||
| 
 | 
 | ||||||
| @ -62,6 +65,7 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) | |||||||
| 
 | 
 | ||||||
| def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False, | def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False, | ||||||
|                remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, |                remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, | ||||||
|  |                strip_dashes=False, | ||||||
|                remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False): |                remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False): | ||||||
|     if lowercase: |     if lowercase: | ||||||
|         text = text.lower() |         text = text.lower() | ||||||
| @ -71,6 +75,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False | |||||||
| 
 | 
 | ||||||
|     text = text.translate(DASHES_TRANS) |     text = text.translate(DASHES_TRANS) | ||||||
| 
 | 
 | ||||||
|  |     if strip_dashes: | ||||||
|  |         text = DASHES_RE.sub("-", text) | ||||||
|  | 
 | ||||||
|     if remove_urls: |     if remove_urls: | ||||||
|         text = LINK_RE.sub(" ", text) |         text = LINK_RE.sub(" ", text) | ||||||
| 
 | 
 | ||||||
| @ -96,6 +103,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False | |||||||
|     if strip_quotes: |     if strip_quotes: | ||||||
|         words = map(lambda w: w.strip("\"'“”"), words) |         words = map(lambda w: w.strip("\"'“”"), words) | ||||||
| 
 | 
 | ||||||
|  |     if strip_dashes: | ||||||
|  |         words = map(lambda w: w.strip("-"), words) | ||||||
|  | 
 | ||||||
|     if bigrams: |     if bigrams: | ||||||
|         words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) |         words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @ -2,7 +2,7 @@ from setuptools import setup | |||||||
| 
 | 
 | ||||||
| setup( | setup( | ||||||
|     name="hexlib", |     name="hexlib", | ||||||
|     version="1.76", |     version="1.77", | ||||||
|     description="Misc utility methods", |     description="Misc utility methods", | ||||||
|     author="simon987", |     author="simon987", | ||||||
|     author_email="me@simon987.net", |     author_email="me@simon987.net", | ||||||
|  | |||||||
| @ -152,7 +152,7 @@ class TestText(TestCase): | |||||||
|             remove_stopwords_en=True, |             remove_stopwords_en=True, | ||||||
|             remove_urls=True |             remove_urls=True | ||||||
|         ) |         ) | ||||||
|         expected = "hello world" |         expected = "hello world |" | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(" ".join(cleaned), expected) |         self.assertEqual(" ".join(cleaned), expected) | ||||||
| 
 | 
 | ||||||
| @ -170,7 +170,7 @@ class TestText(TestCase): | |||||||
|             remove_urls=False |             remove_urls=False | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" |         expected = ">>217709510 is there a servant that is against civilization and humanity literally instant summon" | ||||||
|         self.assertEqual(" ".join(cleaned), expected) |         self.assertEqual(" ".join(cleaned), expected) | ||||||
| 
 | 
 | ||||||
|     def test_html_entity(self): |     def test_html_entity(self): | ||||||
| @ -257,3 +257,13 @@ class TestText(TestCase): | |||||||
|         expected = "hi test hello" |         expected = "hi test hello" | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(" ".join(cleaned), expected) |         self.assertEqual(" ".join(cleaned), expected) | ||||||
|  | 
 | ||||||
|  |     def test_strip_dashes(self): | ||||||
|  |         text = "yes -But something-something -- hello aa--bb" | ||||||
|  |         cleaned = preprocess( | ||||||
|  |             text, | ||||||
|  |             strip_dashes=True | ||||||
|  |         ) | ||||||
|  |         expected = "yes But something-something hello aa-bb" | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(" ".join(cleaned), expected) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user