mirror of
https://github.com/simon987/hexlib.git
synced 2025-12-14 07:09:05 +00:00
Add option to use nltk word_tokenize
This commit is contained in:
@@ -267,3 +267,13 @@ class TestText(TestCase):
|
||||
expected = "yes But something-something hello aa-bb"
|
||||
|
||||
self.assertEqual(" ".join(cleaned), expected)
|
||||
|
||||
def test_word_tokenize(self):
    """Check ``preprocess`` output when ``use_nltk_tokenizer=True``.

    The input ``"i cannot believe'"`` is expected to come back as tokens
    that, joined with single spaces, read ``"i can not believe '"``.
    """
    tokens = preprocess("i cannot believe'", use_nltk_tokenizer=True)
    joined = " ".join(tokens)

    self.assertEqual(joined, "i can not believe '")
|
||||
|
||||
Reference in New Issue
Block a user