From a7bf5b2d15054600191fd79953bb58d60549e1cf Mon Sep 17 00:00:00 2001
From: simon987 <me@simon987.net>
Date: Sat, 28 Aug 2021 19:59:04 -0400
Subject: [PATCH] Fix clean html (again!)

---
 hexlib/regex.py   |  2 +-
 hexlib/text.py    |  2 ++
 setup.py          |  2 +-
 test/test_text.py | 24 +++++++++++++++++++++---
 4 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/hexlib/regex.py b/hexlib/regex.py
index c10465e..b579974 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -3,5 +3,5 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+")
 XML_ENTITY_RE = re.compile(r"&[a-z]+;")
diff --git a/hexlib/text.py b/hexlib/text.py
index ca10b44..0c8f495 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -56,6 +56,8 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
 
     if clean_html:
         try:
+            text = "<root>" + text + "</root>"
+
             parser = etree.XMLParser(recover=True)
             root = etree.fromstring(text, parser)
 
diff --git a/setup.py b/setup.py
index 7420104..d3c9c75 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.46",
+    version="1.47",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
diff --git a/test/test_text.py b/test/test_text.py
index 5f8ea94..43afb6d 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -166,7 +166,7 @@ class TestText(TestCase):
 
     def test_html_no_root(self):
         text = "<a href=\"#p217709510\" class=\"quotelink\">&gt;&gt;217709510</a><br>Is there a<wbr>servant that is against civilization and humanity?<br>Literally instant summon."
-        
+
         cleaned = preprocess(
             text,
             clean_html=True,
@@ -178,9 +178,27 @@ class TestText(TestCase):
             remove_stopwords_en=False,
             remove_urls=False
         )
-        
+
         expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
-        
+        self.assertEqual(cleaned, expected)
+
+    def test_html_entity(self):  # tag-less input that contains only a character reference
+        text = "doesn&#039;t"  # &#039; is the numeric character reference for an apostrophe (')
+
+        cleaned = preprocess(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            lemmatize=False,
+            fix_single_quotes=True,
+            remove_stopwords_en=False,
+            remove_urls=False
+        )
+
+        expected = "doesn't"  # reference decoded to a plain apostrophe, still present with remove_punctuation=True
+        self.assertEqual(cleaned, expected)
 
     def test_html_invalid_attribute(self):
         text = '<root><iframe width="560" height="315" src=" " title="youtube video player" frameborder="0" allowfullscreen></iframe></root>'