diff --git a/hexlib/text.py b/hexlib/text.py index 01e3f6e..97847aa 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -43,7 +43,7 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati if clean_html: try: - root = etree.fromstring(text) + root = etree.fromstring(text.replace("&", "")) text = "".join(get_text(root)) except: pass diff --git a/test/test_text.py b/test/test_text.py index f3555e9..ed9c9a6 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -146,3 +146,20 @@ class TestText(TestCase): expected = "hello world it's it's" self.assertEqual(cleaned, expected) + + def test_html_11(self): + text = "