add bigram option for clean function

This commit is contained in:
2021-04-21 21:34:49 -04:00
parent 8edad0255b
commit 88f3124f85
2 changed files with 35 additions and 5 deletions

View File

@@ -148,7 +148,7 @@ class TestText(TestCase):
self.assertEqual(cleaned, expected)
def test_html_11(self):
text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
cleaned = clean(
text,
clean_html=True,
@@ -163,3 +163,18 @@ class TestText(TestCase):
expected = "hello world"
self.assertEqual(cleaned, expected)
def test_bigrams(self):
    # Token pairs that clean() is expected to join with an underscore.
    # They are given in lowercase while the input contains an uppercase
    # "A", so the ("a", "b") pair can only match after lowercasing.
    pairs = {
        ("a", "b"),
        ("c", "d"),
        ("f", "g"),
    }
    raw = "x A b c d e f g h"

    result = clean(raw, lowercase=True, bigrams=pairs)

    # "x", "e" and "h" belong to no listed pair and must stay separate.
    self.assertEqual(result, "x a_b c_d e f_g h")