Fix clean html (again)

This commit is contained in:
simon987 2021-08-28 19:44:10 -04:00
parent 4cff343370
commit 31b35e3a32
3 changed files with 23 additions and 10 deletions

View File

@ -56,16 +56,11 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
if clean_html:
try:
text = XML_ENTITY_RE.sub(" ", text)
text = text.replace("&", " ")
text = text.replace("<br>", "<br/>")
text = "<root>" + text + "</root>"
root = etree.fromstring(text)
parser = etree.XMLParser(recover=True)
root = etree.fromstring(text, parser)
text = " ".join(get_text(root))
except Exception as e:
raise e
except:
pass
if remove_punctuation:

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.45",
version="1.46",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",

View File

@ -165,7 +165,7 @@ class TestText(TestCase):
self.assertEqual(cleaned, expected)
def test_html_no_root(self):
text = "<a href=\"#p217709510\" class=\"quotelink\">&gt;&gt;217709510</a><br>Is there a servant that is against civilization and humanity?<br>Literally instant summon."
text = "<a href=\"#p217709510\" class=\"quotelink\">&gt;&gt;217709510</a><br>Is there a<wbr>servant that is against civilization and humanity?<br>Literally instant summon."
cleaned = preprocess(
text,
@ -180,6 +180,24 @@ class TestText(TestCase):
)
expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
def test_html_invalid_attribute(self):
    """Check that markup with a valueless attribute cleans to an empty string.

    The input wraps an ``<iframe>`` whose ``allowfullscreen`` attribute
    has no value and which contains no text, so the cleaned result is
    expected to be ``""``.
    """
    markup = '<root><iframe width="560" height="315" src=" " title="youtube video player" frameborder="0" allowfullscreen></iframe></root>'
    # Same option set as the original call, gathered in one place.
    options = dict(
        clean_html=True,
        lowercase=True,
        remove_punctuation=True,
        strip=True,
        lemmatize=False,
        fix_single_quotes=True,
        remove_stopwords_en=False,
        remove_urls=False,
    )
    result = preprocess(markup, **options)
    self.assertEqual(result, "")