Fix clean html (again!)

2025-12-16 08:09:06 +00:00 · 2021-08-28 19:59:04 -04:00
parent 31b35e3a32
commit a7bf5b2d15
4 changed files with 25 additions and 5 deletions
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -3,5 +3,5 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+")
 XML_ENTITY_RE = re.compile(r"&[a-z]+;")
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -56,6 +56,8 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc

    if clean_html:
        try:
+            text = "<root>" + text + "</root>"
+
            parser = etree.XMLParser(recover=True)
            root = etree.fromstring(text, parser)

--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup

 setup(
    name="hexlib",
-    version="1.46",
+    version="1.47",
    description="Misc utility methods",
    author="simon987",
    author_email="me@simon987.net",
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -166,7 +166,7 @@ class TestText(TestCase):

    def test_html_no_root(self):
        text = "<a href=\"#p217709510\" class=\"quotelink\">&gt;&gt;217709510</a><br>Is there a<wbr>servant that is against civilization and humanity?<br>Literally instant summon."
-        
+
        cleaned = preprocess(
            text,
            clean_html=True,
@@ -178,9 +178,27 @@ class TestText(TestCase):
            remove_stopwords_en=False,
            remove_urls=False
        )
-        
+
        expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
-        
+        self.assertEqual(cleaned, expected)
+
+    def test_html_entity(self):
+        text = "doesn&#039;t"
+
+        cleaned = preprocess(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            lemmatize=False,
+            fix_single_quotes=True,
+            remove_stopwords_en=False,
+            remove_urls=False
+        )
+
+        expected = "doesn't"
+        self.assertEqual(cleaned, expected)

    def test_html_invalid_attribute(self):
        text = '<root><iframe width="560" height="315" src=" " title="youtube video player" frameborder="0" allowfullscreen></iframe></root>'