From 88f3124f85050a03bce7f53025c5b775fe8f94e5 Mon Sep 17 00:00:00 2001 From: simon987 Date: Wed, 21 Apr 2021 21:34:49 -0400 Subject: [PATCH] add bigram option for clean function --- hexlib/text.py | 23 +++++++++++++++++++---- test/test_text.py | 17 ++++++++++++++++- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/hexlib/text.py b/hexlib/text.py index 97847aa..b781782 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -32,9 +32,22 @@ def clean_multicore(texts, processes, **kwargs): ) +def _transform_bigram(ngram_seq, ngrams): + for ngram in ngram_seq: + if ngram in ngrams: + yield "_".join(ngram) + + ngram_seq.__next__() + else: + yield ngram[0] + + def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False): + remove_urls=False, bigrams: set = None): + if lowercase: + text = text.lower() + if fix_single_quotes: text = text.replace("`", "'") @@ -51,9 +64,6 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati if remove_punctuation: text = PUNCTUATION_RE.sub(" ", text) - if lowercase: - text = text.lower() - if not remove_stopwords_en or not lemmatize or not strip_quotes: text = WHITESPACE_RE.sub(" ", text) @@ -61,6 +71,11 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati words = WHITESPACE_RE.split(text) text = " ".join(w.strip("\"'") for w in words) + if bigrams: + words = WHITESPACE_RE.split(text) + words.append("*") + text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) + if remove_stopwords_en or lemmatize: words = WHITESPACE_RE.split(text) diff --git a/test/test_text.py b/test/test_text.py index ed9c9a6..334f456 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -148,7 +148,7 @@ class TestText(TestCase): self.assertEqual(cleaned, expected) def test_html_11(self): - text = "
\n Hello, \t\nworld! it's it`s u & | \n\t
" + text = "
\n Hello, \t\nworld! it's it`s u us & | \n\t
" cleaned = clean( text, clean_html=True, @@ -163,3 +163,18 @@ class TestText(TestCase): expected = "hello world" self.assertEqual(cleaned, expected) + + def test_bigrams(self): + text = "x A b c d e f g h" + cleaned = clean( + text, + lowercase=True, + bigrams={ + ("a", "b"), + ("c", "d"), + ("f", "g"), + } + ) + expected = "x a_b c_d e f_g h" + + self.assertEqual(cleaned, expected)