diff --git a/hexlib/regex.py b/hexlib/regex.py
index c4ce1c1..c10465e 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -4,3 +4,4 @@ LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
 PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
+XML_ENTITY_RE = re.compile(r"&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#[xX][0-9a-fA-F]+);")  # named (&gt;, &Eacute;, &frac12;), decimal (&#39;) and hex (&#x27;) references
diff --git a/hexlib/text.py b/hexlib/text.py
index 52c901b..e081299 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -6,7 +6,7 @@ from lxml import etree
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 
-from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE
+from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE, XML_ENTITY_RE
 
 get_text = etree.XPath("//text()")
 
@@ -56,9 +56,16 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
 
     if clean_html:
         try:
-            root = etree.fromstring(text.replace("&", ""))
-            text = "".join(get_text(root))
-        except:
+            # Blank out entity references and stray "&" so they cannot break XML parsing
+            markup = XML_ENTITY_RE.sub(" ", text).replace("&", " ")
+            # <br> is a void tag in HTML but must be self-closed for an XML parser
+            markup = markup.replace("<br>", "<br/>")
+            # Wrap in one root element so fragments with several top-level nodes parse
+            root = etree.fromstring("<root>" + markup + "</root>")
+
+            text = " ".join(get_text(root))
+        except etree.XMLSyntaxError:
+            # Best effort: markup that is still not well-formed leaves text unchanged
             pass
 
     if remove_punctuation:
diff --git a/test/test_text.py b/test/test_text.py
index 441e84b..11c9693 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -164,6 +164,25 @@ class TestText(TestCase):
 
         self.assertEqual(cleaned, expected)
 
+    def test_html_no_root(self):
+        """An HTML fragment with no single root element is still cleaned."""
+        text = (
+            "<a href=\"#p217709510\" class=\"quotelink\">&gt;&gt;217709510</a>"
+            "<br>Is there a servant that is against civilization and humanity?"
+            "<br>Literally instant summon."
+        )
+        options = dict(
+            clean_html=True, lowercase=True, remove_punctuation=True, strip=True,
+            lemmatize=False, fix_single_quotes=True,
+            remove_stopwords_en=False, remove_urls=False,
+        )
+
+        cleaned = preprocess(text, **options)
+
+        expected = ("217709510 is there a servant that is against civilization "
+                    "and humanity literally instant summon")
+        self.assertEqual(cleaned, expected)
+
     def test_bigrams(self):
         text = "x A b c d e f g h"
         cleaned = preprocess(