improve text cleaning

This commit is contained in:
2021-04-18 21:10:07 -04:00
parent 067a20f7a8
commit 2ffaa4a5b3
3 changed files with 35 additions and 11 deletions

View File

@@ -42,7 +42,6 @@ class TestText(TestCase):
text,
clean_html=True,
lowercase=True,
compress_whitespace=True
)
expected = " hello, world "
@@ -54,7 +53,6 @@ class TestText(TestCase):
text,
clean_html=True,
lowercase=True,
compress_whitespace=True,
strip=True
)
expected = "hello, world"
@@ -67,7 +65,6 @@ class TestText(TestCase):
text,
clean_html=True,
lowercase=True,
compress_whitespace=True,
strip=True,
remove_punctuation=True
)
@@ -133,3 +130,19 @@ class TestText(TestCase):
expected = "hello world it's it's"
self.assertEqual(cleaned, expected)
def test_html_10(self):
    """Run clean() on an HTML snippet with a URL and mixed quote marks.

    The call enables clean_html, lowercase, remove_punctuation, strip,
    lemmatize, fix_single_quotes and remove_urls, and the result is
    compared against a fixed expected string.
    """
    markup = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
    # Keyword order mirrors the flags this test is meant to exercise together.
    options = {
        "clean_html": True,
        "lowercase": True,
        "remove_punctuation": True,
        "strip": True,
        "lemmatize": True,
        "fix_single_quotes": True,
        "remove_urls": True,
    }
    result = clean(markup, **options)
    self.assertEqual(result, "hello world it's it's")