diff --git a/hexlib/regex.py b/hexlib/regex.py
index c4ce1c1..c10465e 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -4,3 +4,4 @@ LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
WHITESPACE_RE = re.compile(r"\s+")
PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
+XML_ENTITY_RE = re.compile(r"&[a-z]+;")
diff --git a/hexlib/text.py b/hexlib/text.py
index 52c901b..e081299 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -6,7 +6,7 @@ from lxml import etree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
-from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE
+from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE, XML_ENTITY_RE
get_text = etree.XPath("//text()")
@@ -56,9 +56,16 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
if clean_html:
try:
- root = etree.fromstring(text.replace("&", ""))
- text = "".join(get_text(root))
- except:
+ text = XML_ENTITY_RE.sub(" ", text)
+ text = text.replace("&", " ")
+ text = text.replace("<br>", "<br/>")
+ text = "<root>" + text + "</root>"
+
+ root = etree.fromstring(text)
+
+ text = " ".join(get_text(root))
+ except Exception as e:
+ raise e
pass
if remove_punctuation:
diff --git a/test/test_text.py b/test/test_text.py
index 441e84b..11c9693 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -164,6 +164,25 @@ class TestText(TestCase):
self.assertEqual(cleaned, expected)
+ def test_html_no_root(self):
+ text = "&gt;&gt;217709510<br>Is there a servant that is against civilization and humanity?<br>Literally instant summon."
+
+ cleaned = preprocess(
+ text,
+ clean_html=True,
+ lowercase=True,
+ remove_punctuation=True,
+ strip=True,
+ lemmatize=False,
+ fix_single_quotes=True,
+ remove_stopwords_en=False,
+ remove_urls=False
+ )
+
+ expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
+
+ self.assertEqual(cleaned, expected)
+
def test_bigrams(self):
text = "x A b c d e f g h"
cleaned = preprocess(