diff --git a/hexlib/text.py b/hexlib/text.py
index e081299..ca10b44 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -56,16 +56,11 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
if clean_html:
try:
- text = XML_ENTITY_RE.sub(" ", text)
- text = text.replace("&", " ")
- text = text.replace("
", "
")
- text = "" + text + ""
-
- root = etree.fromstring(text)
+ parser = etree.XMLParser(recover=True)
+ root = etree.fromstring(text, parser)
text = " ".join(get_text(root))
- except Exception as e:
- raise e
+ except:
pass
if remove_punctuation:
diff --git a/setup.py b/setup.py
index 037487b..7420104 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
- version="1.45",
+ version="1.46",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",
diff --git a/test/test_text.py b/test/test_text.py
index 11c9693..5f8ea94 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -165,7 +165,7 @@ class TestText(TestCase):
self.assertEqual(cleaned, expected)
def test_html_no_root(self):
- text = ">>217709510
Is there a servant that is against civilization and humanity?
Literally instant summon."
+ text = ">>217709510
Is there aservant that is against civilization and humanity?
Literally instant summon."
cleaned = preprocess(
text,
@@ -180,6 +180,24 @@ class TestText(TestCase):
)
expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
+
+
+ def test_html_invalid_attribute(self):
+ text = ''
+
+ cleaned = preprocess(
+ text,
+ clean_html=True,
+ lowercase=True,
+ remove_punctuation=True,
+ strip=True,
+ lemmatize=False,
+ fix_single_quotes=True,
+ remove_stopwords_en=False,
+ remove_urls=False
+ )
+
+ expected = ""
self.assertEqual(cleaned, expected)