From 31b35e3a32261efb7662c52ffe01afa97f111b0a Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 28 Aug 2021 19:44:10 -0400 Subject: [PATCH] Fix clean html (again) --- hexlib/text.py | 11 +++-------- setup.py | 2 +- test/test_text.py | 20 +++++++++++++++++++- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/hexlib/text.py b/hexlib/text.py index e081299..ca10b44 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -56,16 +56,11 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc if clean_html: try: - text = XML_ENTITY_RE.sub(" ", text) - text = text.replace("&", " ") - text = text.replace("
", "
") - text = "" + text + "" - - root = etree.fromstring(text) + parser = etree.XMLParser(recover=True) + root = etree.fromstring(text, parser) text = " ".join(get_text(root)) - except Exception as e: - raise e + except: pass if remove_punctuation: diff --git a/setup.py b/setup.py index 037487b..7420104 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.45", + version="1.46", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 11c9693..5f8ea94 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -165,7 +165,7 @@ class TestText(TestCase): self.assertEqual(cleaned, expected) def test_html_no_root(self): - text = ">>217709510
Is there a servant that is against civilization and humanity?
Literally instant summon." + text = ">>217709510
Is there aservant that is against civilization and humanity?
Literally instant summon." cleaned = preprocess( text, @@ -180,6 +180,24 @@ class TestText(TestCase): ) expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" + + + def test_html_invalid_attribute(self): + text = '' + + cleaned = preprocess( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + lemmatize=False, + fix_single_quotes=True, + remove_stopwords_en=False, + remove_urls=False + ) + + expected = "" self.assertEqual(cleaned, expected)