mirror of
https://github.com/simon987/hexlib.git
synced 2025-12-14 07:09:05 +00:00
Add text cleaning function
This commit is contained in:
105
test/test_text.py
Normal file
105
test/test_text.py
Normal file
@@ -0,0 +1,105 @@
|
||||
from unittest import TestCase
|
||||
|
||||
from hexlib.text import clean
|
||||
|
||||
|
||||
class TestText(TestCase):
    """Checks for ``hexlib.text.clean`` on small HTML snippets.

    Each test passes a fixed input string with a particular combination of
    keyword flags and compares the result against a literal expected string.
    """

    def test_html_invalid(self):
        # Empty input with clean_html enabled should come back empty.
        markup = ""
        self.assertEqual(clean(markup, clean_html=True), "")

    def test_html_1(self):
        # clean_html only: expect just the text content, case untouched.
        markup = "<div>Hello, <strong>world</strong></div>"
        self.assertEqual(clean(markup, clean_html=True), "Hello, world")

    def test_html_2(self):
        # clean_html + lowercase: same text content, lower-cased.
        markup = "<div>Hello, <strong>world</strong></div>"
        result = clean(markup, clean_html=True, lowercase=True)
        self.assertEqual(result, "hello, world")

    def test_html_3(self):
        # compress_whitespace without strip: the expected value still has
        # one leading and one trailing space.
        markup = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            compress_whitespace=True,
        )
        self.assertEqual(result, " hello, world ")

    def test_html_4(self):
        # Adding strip=True: the surrounding spaces are expected to be gone.
        markup = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            compress_whitespace=True,
            strip=True,
        )
        self.assertEqual(result, "hello, world")

    def test_html_5(self):
        # Adding remove_punctuation=True: the comma is expected to be gone.
        markup = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            compress_whitespace=True,
            strip=True,
            remove_punctuation=True,
        )
        self.assertEqual(result, "hello world")

    def test_html_6(self):
        # remove_stopwords_en=True: "a" and "the" from the input are not
        # present in the expected output. compress_whitespace is not passed.
        markup = "<div>\n Hello, \t\n<strong>a the world </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
        )
        self.assertEqual(result, "hello world")

    def test_html_7(self):
        # lemmatize=True on top of the previous flags: input "worlds" is
        # expected to come out as "world".
        markup = "<div>\n Hello, \t\n<strong>a the worlds </strong>\n\t</div>"
        result = clean(
            markup,
            clean_html=True,
            lowercase=True,
            remove_punctuation=True,
            strip=True,
            remove_stopwords_en=True,
            lemmatize=True,
        )
        self.assertEqual(result, "hello world")
|
||||
Reference in New Issue
Block a user