1
0
mirror of https://github.com/simon987/hexlib.git synced 2025-04-24 12:05:50 +00:00

improve text cleaning

This commit is contained in:
simon987 2021-04-18 21:27:12 -04:00
parent 2ffaa4a5b3
commit 32119535ae
2 changed files with 18 additions and 1 deletion

@ -43,7 +43,7 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
if clean_html:
try:
root = etree.fromstring(text)
root = etree.fromstring(text.replace("&", ""))
text = "".join(get_text(root))
except:
pass

@ -146,3 +146,20 @@ class TestText(TestCase):
expected = "hello world it's it's"
self.assertEqual(cleaned, expected)
def test_html_11(self):
    # Run clean() with every cleaning option enabled on an HTML snippet
    # whose <strong> element contains a bare "&" and a "|" character.
    options = dict(
        clean_html=True,
        lowercase=True,
        remove_punctuation=True,
        strip=True,
        lemmatize=True,
        fix_single_quotes=True,
        remove_stopwords_en=True,
        remove_urls=True,
    )
    markup = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
    actual = clean(markup, **options)
    # Only the two content words are expected to remain after cleaning.
    self.assertEqual(actual, "hello world")