Add remove_numbers

This commit is contained in:
simon987 2021-08-28 20:06:53 -04:00
parent a7bf5b2d15
commit 67c09cc10c
3 changed files with 19 additions and 3 deletions

View File

@@ -44,7 +44,7 @@ def _transform_bigram(ngram_seq, ngrams):
def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
remove_urls=False, bigrams: set = None):
remove_urls=False, bigrams: set = None, remove_numbers=False):
if lowercase:
text = text.lower()
@@ -79,9 +79,14 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
words.append("*")
text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
if remove_stopwords_en or lemmatize:
if remove_stopwords_en or lemmatize or remove_numbers:
words = text.split(" ")
if remove_numbers:
words = filter(lambda w: not w.isnumeric(), words)
if not lemmatize and not remove_stopwords_en:
text = " ".join(words)
if lemmatize and remove_stopwords_en:
text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
elif not lemmatize and remove_stopwords_en:

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.47",
version="1.48",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",

View File

@@ -233,3 +233,14 @@ class TestText(TestCase):
expected = "x a_b c_d e f_g h"
self.assertEqual(cleaned, expected)
def test_remove_numbers(self):
    # With remove_numbers=True, tokens made up entirely of digits
    # ("12", "1", "1111111") are expected to disappear, while tokens that
    # merely contain digits ("hello1", "test1124test") must survive.
    result = preprocess(
        "Hello1 test1124test 12 1 1111111 world",
        lowercase=True,
        remove_numbers=True,
    )
    self.assertEqual(result, "hello1 test1124test world")