From a7bf5b2d15054600191fd79953bb58d60549e1cf Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 28 Aug 2021 19:59:04 -0400 Subject: [PATCH] Fix clean html (again!) --- hexlib/regex.py | 2 +- hexlib/text.py | 2 ++ setup.py | 2 +- test/test_text.py | 24 +++++++++++++++++++++--- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/hexlib/regex.py b/hexlib/regex.py index c10465e..b579974 100644 --- a/hexlib/regex.py +++ b/hexlib/regex.py @@ -3,5 +3,5 @@ import re LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") WHITESPACE_RE = re.compile(r"\s+") -PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+") +PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+") XML_ENTITY_RE = re.compile(r"&[a-z]+;") diff --git a/hexlib/text.py b/hexlib/text.py index ca10b44..0c8f495 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -56,6 +56,8 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc if clean_html: try: + text = "" + text + "" + parser = etree.XMLParser(recover=True) root = etree.fromstring(text, parser) diff --git a/setup.py b/setup.py index 7420104..d3c9c75 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.46", + version="1.47", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 5f8ea94..43afb6d 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -166,7 +166,7 @@ class TestText(TestCase): def test_html_no_root(self): text = ">>217709510
Is there aservant that is against civilization and humanity?
Literally instant summon." - + cleaned = preprocess( text, clean_html=True, @@ -178,9 +178,27 @@ class TestText(TestCase): remove_stopwords_en=False, remove_urls=False ) - + expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" - + self.assertEqual(cleaned, expected) + + def test_html_entity(self): + text = "doesn't" + + cleaned = preprocess( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + lemmatize=False, + fix_single_quotes=True, + remove_stopwords_en=False, + remove_urls=False + ) + + expected = "doesn't" + self.assertEqual(cleaned, expected) def test_html_invalid_attribute(self): text = ''