Add option to use nltk word_tokenize

This commit is contained in:
2023-09-09 11:11:44 -04:00
parent a047366926
commit b1a1da3bac
3 changed files with 19 additions and 3 deletions

View File

@@ -267,3 +267,13 @@ class TestText(TestCase):
expected = "yes But something-something hello aa-bb"
self.assertEqual(" ".join(cleaned), expected)
def test_word_tokenize(self):
    """Check ``preprocess`` output when ``use_nltk_tokenizer`` is enabled.

    The input contains "cannot" and ends with a stray apostrophe; the
    space-joined tokens are expected to read ``i can not believe '``.
    """
    tokens = preprocess("i cannot believe'", use_nltk_tokenizer=True)
    joined = " ".join(tokens)
    self.assertEqual(joined, "i can not believe '")