diff --git a/hexlib/text.py b/hexlib/text.py index 01e3f6e..97847aa 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -43,7 +43,7 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati if clean_html: try: - root = etree.fromstring(text) + root = etree.fromstring(text.replace("&", "")) text = "".join(get_text(root)) except: pass diff --git a/test/test_text.py b/test/test_text.py index f3555e9..ed9c9a6 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -146,3 +146,20 @@ class TestText(TestCase): expected = "hello world it's it's" self.assertEqual(cleaned, expected) + + def test_html_11(self): + text = "
\n Hello, \t\nworld! it's it`s u & | \n\t
" + cleaned = clean( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + lemmatize=True, + fix_single_quotes=True, + remove_stopwords_en=True, + remove_urls=True + ) + expected = "hello world" + + self.assertEqual(cleaned, expected)