Add option to use nltk word_tokenize

2025-12-14 07:09:05 +00:00 · 2023-09-09 11:11:44 -04:00
parent a047366926
commit b1a1da3bac
3 changed files with 19 additions and 3 deletions
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -267,3 +267,13 @@ class TestText(TestCase):
        expected = "yes But something-something hello aa-bb"

        self.assertEqual(" ".join(cleaned), expected)
+
+    def test_word_tokenize(self):  # checks preprocess() output when use_nltk_tokenizer=True
+        text = "i cannot believe'"  # input has the word "cannot" and a trailing apostrophe
+        cleaned = preprocess(
+            text,
+            use_nltk_tokenizer=True
+        )
+        expected = "i can not believe '"  # "cannot" comes back as "can not"; the apostrophe is its own token
+
+        self.assertEqual(" ".join(cleaned), expected)  # tokens are space-joined before comparing