from unittest import TestCase

from hexlib.text import preprocess


class TestText(TestCase):
    """Expectation tests for ``hexlib.text.preprocess``.

    Each ``test_html_*`` case passes ``clean_html=True`` and, from one case
    to the next, switches on more options, pinning the exact string that
    ``preprocess`` is expected to return.
    """

    # Line breaks that sat inside the input literals are written as explicit
    # newline escapes: a single-quoted Python string cannot span physical
    # lines.
    #
    # NOTE(review): none of the inputs below contains an HTML tag, and
    # test_html_1 / test_html_2 expect no surrounding whitespace without
    # strip=True while test_html_3 expects the edge spaces to be kept.
    # Presumably the markup inside these literals was lost at some point;
    # confirm the inputs against the canonical copy of this file.

    def test_html_invalid(self):
        """Expect an empty input to come back as an empty string."""
        text = ""
        cleaned = preprocess(
            text,
            clean_html=True,
        )
        expected = ""

        self.assertEqual(cleaned, expected)

    def test_html_1(self):
        """Expect the bare text when only ``clean_html`` is enabled."""
        text = "\nHello, world\n"
        cleaned = preprocess(
            text,
            clean_html=True,
        )
        expected = "Hello, world"

        self.assertEqual(cleaned, expected)

    def test_html_2(self):
        """Expect lower-cased text when ``lowercase`` is added."""
        text = "\nHello, world\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
        )
        expected = "hello, world"

        self.assertEqual(cleaned, expected)

    def test_html_3(self):
        """Expect whitespace runs as single spaces, edge spaces kept."""
        text = "\n\n Hello, \t\n world \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
        )
        # No strip=True here, so one leading and one trailing space remain.
        expected = " hello, world "

        self.assertEqual(cleaned, expected)

    def test_html_4(self):
        """Expect the edge spaces to disappear once ``strip`` is added."""
        text = "\n\n Hello, \t\n world \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            strip=True,
        )
        expected = "hello, world"

        self.assertEqual(cleaned, expected)

    def test_html_5(self):
        """Expect the comma to be dropped with ``remove_punctuation``."""
        text = "\n\n Hello, \t\n world \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            strip=True,
            remove_punctuation=True,
        )
        expected = "hello world"

        self.assertEqual(cleaned, expected)

    def test_html_6(self):
        """Expect "a" and "the" to be dropped with ``remove_stopwords_en``."""
        text = "\n\n Hello, \t\na the world \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
        )
        expected = "hello world"

        self.assertEqual(cleaned, expected)

    def test_html_7(self):
        """Expect "worlds" to become "world" with ``lemmatize``."""
        text = "\n\n Hello, \t\na the worlds \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
            lemmatize=True,
        )
        expected = "hello world"

        self.assertEqual(cleaned, expected)

    def test_html_8(self):
        """Expect the same result when the plural is followed by "!"."""
        text = "\n\n Hello, \t\na the worlds! \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
            lemmatize=True,
        )
        expected = "hello world"

        self.assertEqual(cleaned, expected)

    def test_html_9(self):
        """Expect a backtick used as an apostrophe to be normalised.

        With ``fix_single_quotes`` both spellings of "it's" in the input are
        expected to come out with a plain apostrophe, which is also expected
        to survive ``remove_punctuation``.
        """
        text = "\n\n Hello, \t\nworld! it's it`s \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            lemmatize=True,
            fix_single_quotes=True,
        )
        expected = "hello world it's it's"

        self.assertEqual(cleaned, expected)

    def test_html_10(self):
        """Expect the URL in the input to be dropped with ``remove_urls``."""
        text = (
            "\n\n Hello, \t\nworld! it's it`s "
            "https://google.ca/test/abc.pdf \n\t\n"
        )
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            lemmatize=True,
            fix_single_quotes=True,
            remove_urls=True,
        )
        expected = "hello world it's it's"

        self.assertEqual(cleaned, expected)

    def test_html_11(self):
        """Expect only "hello world" with all of the options combined.

        Compared with ``test_html_10`` this also enables
        ``remove_stopwords_en`` and the input carries stray "&" and "|"
        characters; both "it's" tokens are expected to be gone as well.
        """
        text = "\n\n Hello, \t\nworld! it's it`s & | \n\t\n"
        cleaned = preprocess(
            text,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            lemmatize=True,
            fix_single_quotes=True,
            remove_stopwords_en=True,
            remove_urls=True,
        )
        expected = "hello world"

        self.assertEqual(cleaned, expected)

    def test_bigrams(self):
        """Expect listed adjacent word pairs to be joined with "_".

        The input has an upper-case "A" while the bigram is given as
        ("a", "b"); with ``lowercase=True`` the pair is still expected to be
        merged. Words that are in no listed pair ("x", "e", "h") are expected
        to stay as they are.
        """
        text = "x A b c d e f g h"
        cleaned = preprocess(
            text,
            lowercase=True,
            bigrams={
                ("a", "b"),
                ("c", "d"),
                ("f", "g"),
            },
        )
        expected = "x a_b c_d e f_g h"

        self.assertEqual(cleaned, expected)