mirror of
				https://github.com/simon987/hexlib.git
				synced 2025-11-04 01:26:52 +00:00 
			
		
		
		
	Add remove_numbers
This commit is contained in:
		
							parent
							
								
									a7bf5b2d15
								
							
						
					
					
						commit
						67c09cc10c
					
				@ -44,7 +44,7 @@ def _transform_bigram(ngram_seq, ngrams):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
 | 
					def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
 | 
				
			||||||
               remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
 | 
					               remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
 | 
				
			||||||
               remove_urls=False, bigrams: set = None):
 | 
					               remove_urls=False, bigrams: set = None, remove_numbers=False):
 | 
				
			||||||
    if lowercase:
 | 
					    if lowercase:
 | 
				
			||||||
        text = text.lower()
 | 
					        text = text.lower()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -79,9 +79,14 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
 | 
				
			|||||||
        words.append("*")
 | 
					        words.append("*")
 | 
				
			||||||
        text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
 | 
					        text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if remove_stopwords_en or lemmatize:
 | 
					    if remove_stopwords_en or lemmatize or remove_numbers:
 | 
				
			||||||
        words = text.split(" ")
 | 
					        words = text.split(" ")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if remove_numbers:
 | 
				
			||||||
 | 
					            words = filter(lambda w: not w.isnumeric(), words)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if not lemmatize and not remove_stopwords_en:
 | 
				
			||||||
 | 
					            text = " ".join(words)
 | 
				
			||||||
        if lemmatize and remove_stopwords_en:
 | 
					        if lemmatize and remove_stopwords_en:
 | 
				
			||||||
            text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
 | 
					            text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
 | 
				
			||||||
        elif not lemmatize and remove_stopwords_en:
 | 
					        elif not lemmatize and remove_stopwords_en:
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							@ -2,7 +2,7 @@ from setuptools import setup
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
setup(
 | 
					setup(
 | 
				
			||||||
    name="hexlib",
 | 
					    name="hexlib",
 | 
				
			||||||
    version="1.47",
 | 
					    version="1.48",
 | 
				
			||||||
    description="Misc utility methods",
 | 
					    description="Misc utility methods",
 | 
				
			||||||
    author="simon987",
 | 
					    author="simon987",
 | 
				
			||||||
    author_email="me@simon987.net",
 | 
					    author_email="me@simon987.net",
 | 
				
			||||||
 | 
				
			|||||||
@ -233,3 +233,14 @@ class TestText(TestCase):
 | 
				
			|||||||
        expected = "x a_b c_d e f_g h"
 | 
					        expected = "x a_b c_d e f_g h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(cleaned, expected)
 | 
					        self.assertEqual(cleaned, expected)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_remove_numbers(self):
 | 
				
			||||||
 | 
					        text = "Hello1 test1124test 12 1 1111111 world"
 | 
				
			||||||
 | 
					        cleaned = preprocess(
 | 
				
			||||||
 | 
					            text,
 | 
				
			||||||
 | 
					            lowercase=True,
 | 
				
			||||||
 | 
					            remove_numbers=True
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        expected = "hello1 test1124test world"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.assertEqual(cleaned, expected)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user