improve text cleaning

This commit is contained in:
simon987 2021-04-18 21:27:12 -04:00
parent 2ffaa4a5b3
commit 32119535ae
2 changed files with 18 additions and 1 deletions

View File

@ -43,7 +43,7 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
if clean_html: if clean_html:
try: try:
root = etree.fromstring(text) root = etree.fromstring(text.replace("&", ""))
text = "".join(get_text(root)) text = "".join(get_text(root))
except: except:
pass pass

View File

@ -146,3 +146,20 @@ class TestText(TestCase):
expected = "hello world it's it's" expected = "hello world it's it's"
self.assertEqual(cleaned, expected) self.assertEqual(cleaned, expected)
def test_html_11(self):
    """Clean an HTML snippet whose text contains a literal '&' and '|'.

    Every cleaning option used below is switched on at once; the
    snippet is expected to collapse to the two words "hello world".
    """
    raw_html = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
    # Options are collected up front so the call under test stays short.
    options = dict(
        clean_html=True,
        lowercase=True,
        remove_punctuation=True,
        strip=True,
        lemmatize=True,
        fix_single_quotes=True,
        remove_stopwords_en=True,
        remove_urls=True,
    )
    self.assertEqual(clean(raw_html, **options), "hello world")