from unittest import TestCase

from hexlib.text import preprocess


class TestText(TestCase):
    """Tests for ``hexlib.text.preprocess``.

    Every test joins the tokens produced by ``preprocess`` with single
    spaces and compares the result with the expected string.

    NOTE(review): this file had lost its formatting and the markup inside
    the HTML fixtures (raw line breaks were left where tags used to be).
    The fixtures below keep the visible text verbatim and re-wrap it in
    minimal markup (``<div>``, ``<br>``, ``<wbr>``, character entities).
    Confirm each reconstructed fixture against the upstream test data.
    """

    def test_html_invalid(self):
        """An empty input yields no tokens when HTML cleaning is enabled."""
        text = ""
        cleaned = preprocess(
            text,
            clean_html=True,
        )
        expected = ""

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_1(self):
        """Markup is stripped and the text content is kept."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>Hello, world</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
        )
        expected = "Hello, world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_2(self):
        """HTML cleaning combined with lowercasing."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>Hello, world</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
        )
        expected = "hello, world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_4(self):
        """Runs of spaces, tabs and newlines do not produce empty tokens."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>\n Hello, \t\n world \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
        )
        expected = "hello, world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_5(self):
        """Punctuation removal drops the comma after "Hello"."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>\n Hello, \t\n world \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
        )
        expected = "hello world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_6(self):
        """English stopwords ("a", "the") are removed."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>\n Hello, \t\na the world \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            remove_stopwords_en=True,
        )
        expected = "hello world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_7(self):
        """Lemmatization reduces "worlds" to "world"."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>\n Hello, \t\na the worlds \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            remove_stopwords_en=True,
            lemmatize=True,
        )
        expected = "hello world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_8(self):
        """Trailing punctuation does not prevent lemmatization."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>\n Hello, \t\na the worlds! \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            remove_stopwords_en=True,
            lemmatize=True,
        )
        expected = "hello world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_9(self):
        """A backtick used as an apostrophe is normalized to "'"."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = "<div>\n Hello, \t\nworld! it's it`s \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            lemmatize=True,
            fix_single_quotes=True,
        )
        expected = "hello world it's it's"

        self.assertEqual(" ".join(cleaned), expected)

    def test_single_quote(self):
        """Backtick and typographic apostrophes are both normalized."""
        text = "it's it`s it’s"
        cleaned = preprocess(
            text,
            lowercase=True,
            fix_single_quotes=True,
        )
        expected = "it's it's it's"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_10(self):
        """URLs are dropped when ``remove_urls`` is enabled."""
        # NOTE(review): wrapper tag reconstructed -- confirm.
        text = (
            "<div>\n Hello, \t\nworld! it's it`s "
            "https://google.ca/test/abc.pdf \n\t</div>"
        )
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            lemmatize=True,
            fix_single_quotes=True,
            remove_urls=True,
        )
        expected = "hello world it's it's"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_11(self):
        """An ampersand is dropped while a lone "|" is kept."""
        # NOTE(review): wrapper tag and the "&amp;" entity are reconstructed
        # (the mangled source showed a bare "&") -- confirm.
        text = "<div>\n Hello, \t\nworld! it's it`s &amp; | \n\t</div>"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            lemmatize=True,
            fix_single_quotes=True,
            remove_stopwords_en=True,
            remove_urls=True,
        )
        expected = "hello world |"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_no_root(self):
        """An HTML fragment without a single root element is still cleaned."""
        # NOTE(review): "<br>" stands in for the raw line breaks found in the
        # mangled source, "<wbr>" for the separator that must have existed
        # between "a" and "servant" (the expected value has them as two
        # words), and "&gt;" for the visible ">" -- confirm all three.
        text = (
            "&gt;&gt;217709510<br>"
            "Is there a<wbr>servant that is against civilization and "
            "humanity?<br>"
            "Literally instant summon."
        )
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            lemmatize=False,
            fix_single_quotes=True,
            remove_stopwords_en=False,
            remove_urls=False,
        )
        expected = (
            ">>217709510 is there a servant that is against civilization "
            "and humanity literally instant summon"
        )

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_entity(self):
        """A numeric character reference is decoded to its character."""
        # NOTE(review): the entity was reconstructed; the mangled source
        # showed an already-decoded apostrophe -- confirm the exact form.
        text = "doesn&#39;t"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            lemmatize=False,
            fix_single_quotes=True,
            remove_stopwords_en=False,
            remove_urls=False,
        )
        expected = "doesn't"

        self.assertEqual(" ".join(cleaned), expected)

    def test_html_invalid_attribute(self):
        """Input for the invalid-attribute case produces no tokens."""
        # NOTE(review): the fixture is empty in the mangled source;
        # presumably it held an element with a malformed attribute and no
        # text content. As written this only exercises empty input --
        # restore the original markup.
        text = ''
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            lemmatize=False,
            fix_single_quotes=True,
            remove_stopwords_en=False,
            remove_urls=False,
        )
        expected = ""

        self.assertEqual(" ".join(cleaned), expected)

    def test_bigrams(self):
        """Known bigrams are merged into a single "a_b" token."""
        text = "x A b c d e f g h"
        cleaned = preprocess(
            text,
            lowercase=True,
            bigrams={
                ("a", "b"),
                ("c", "d"),
                ("f", "g"),
            },
        )
        expected = "x a_b c_d e f_g h"

        self.assertEqual(" ".join(cleaned), expected)

    def test_trigrams(self):
        """Known trigrams are merged into a single "a_b_c" token."""
        text = "x A b c d e f g h"
        cleaned = preprocess(
            text,
            lowercase=True,
            trigrams={
                ("a", "b", "c"),
                ("e", "f", "g"),
            },
        )
        expected = "x a_b_c d e_f_g h"

        self.assertEqual(" ".join(cleaned), expected)

    def test_remove_numbers(self):
        """All-digit tokens are removed; tokens that mix in letters stay."""
        text = "Hello1 test1124test 12 1 1111111 world"
        cleaned = preprocess(
            text,
            lowercase=True,
            remove_numbers=True,
        )
        expected = "hello1 test1124test world"

        self.assertEqual(" ".join(cleaned), expected)

    def test_strip_quotes(self):
        """Straight and typographic quotes around tokens are stripped."""
        text = "'hi' “test” 'hello\""
        cleaned = preprocess(
            text,
            strip_quotes=True,
        )
        expected = "hi test hello"

        self.assertEqual(" ".join(cleaned), expected)

    def test_strip_dashes(self):
        """Edge dashes are stripped and doubled inner dashes collapsed."""
        text = "yes -But something-something -- hello aa--bb"
        cleaned = preprocess(
            text,
            strip_dashes=True,
        )
        expected = "yes But something-something hello aa-bb"

        # The mangled source had a line break inside this separator; a single
        # space matches the space-separated expected value above.
        self.assertEqual(" ".join(cleaned), expected)

    def test_word_tokenize(self):
        """The NLTK tokenizer splits "cannot" and detaches the quote."""
        text = "i cannot believe'"
        cleaned = preprocess(
            text,
            use_nltk_tokenizer=True,
        )
        expected = "i can not believe '"

        self.assertEqual(" ".join(cleaned), expected)