mirror of
https://github.com/simon987/hexlib.git
synced 2025-12-13 14:49:05 +00:00
improve text cleaning
This commit is contained in:
@@ -42,7 +42,6 @@ class TestText(TestCase):
|
||||
text,
|
||||
clean_html=True,
|
||||
lowercase=True,
|
||||
compress_whitespace=True
|
||||
)
|
||||
expected = " hello, world "
|
||||
|
||||
@@ -54,7 +53,6 @@ class TestText(TestCase):
|
||||
text,
|
||||
clean_html=True,
|
||||
lowercase=True,
|
||||
compress_whitespace=True,
|
||||
strip=True
|
||||
)
|
||||
expected = "hello, world"
|
||||
@@ -67,7 +65,6 @@ class TestText(TestCase):
|
||||
text,
|
||||
clean_html=True,
|
||||
lowercase=True,
|
||||
compress_whitespace=True,
|
||||
strip=True,
|
||||
remove_punctuation=True
|
||||
)
|
||||
@@ -133,3 +130,19 @@ class TestText(TestCase):
|
||||
expected = "hello world it's it's"
|
||||
|
||||
self.assertEqual(cleaned, expected)
|
||||
|
||||
def test_html_10(self):
    """Clean an HTML snippet that contains a URL and a backtick apostrophe.

    Runs ``clean`` with HTML cleaning, lowercasing, punctuation removal,
    stripping, lemmatization, single-quote fixing and URL removal all
    switched on, then checks the output against the expected plain text
    (no markup, no URL, both apostrophe variants rendered as ``'``).
    """
    # Keyword flags are collected once so the call site stays short; the
    # insertion order matches the order the flags are passed to ``clean``.
    options = {
        "clean_html": True,
        "lowercase": True,
        "remove_punctuation": True,
        "strip": True,
        "lemmatize": True,
        "fix_single_quotes": True,
        "remove_urls": True,
    }
    raw_html = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"

    self.assertEqual(
        clean(raw_html, **options),
        "hello world it's it's",
    )
|
||||
|
||||
Reference in New Issue
Block a user