From 67c09cc10c90b5190a231ce6f2e8472c89d1489f Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 28 Aug 2021 20:06:53 -0400 Subject: [PATCH] Add remove_numbers --- hexlib/text.py | 9 +++++++-- setup.py | 2 +- test/test_text.py | 11 +++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/hexlib/text.py b/hexlib/text.py index 0c8f495..7bbcbc5 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -44,7 +44,7 @@ def _transform_bigram(ngram_seq, ngrams): def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None): + remove_urls=False, bigrams: set = None, remove_numbers=False): if lowercase: text = text.lower() @@ -79,9 +79,14 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc words.append("*") text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) - if remove_stopwords_en or lemmatize: + if remove_stopwords_en or lemmatize or remove_numbers: words = text.split(" ") + if remove_numbers: + words = filter(lambda w: not w.isnumeric(), words) + + if not lemmatize and not remove_stopwords_en: + text = " ".join(words) if lemmatize and remove_stopwords_en: text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) elif not lemmatize and remove_stopwords_en: diff --git a/setup.py b/setup.py index d3c9c75..d4ad03b 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.47", + version="1.48", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 43afb6d..5d698f0 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -233,3 +233,14 @@ class TestText(TestCase): expected = "x a_b c_d e f_g h" self.assertEqual(cleaned, expected) + + def test_remove_numbers(self): + text = "Hello1 test1124test 12 1 1111111 world" + cleaned = preprocess( + text, + lowercase=True, + remove_numbers=True + ) + expected = "hello1 test1124test world" + + self.assertEqual(cleaned, expected)