mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-24 12:05:50 +00:00
improve text cleaning
This commit is contained in:
parent
2ffaa4a5b3
commit
32119535ae
@ -43,7 +43,7 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
|
||||
|
||||
if clean_html:
|
||||
try:
|
||||
root = etree.fromstring(text)
|
||||
root = etree.fromstring(text.replace("&", ""))
|
||||
text = "".join(get_text(root))
|
||||
except:
|
||||
pass
|
||||
|
@ -146,3 +146,20 @@ class TestText(TestCase):
|
||||
expected = "hello world it's it's"
|
||||
|
||||
self.assertEqual(cleaned, expected)
|
||||
|
||||
def test_html_11(self):
    """Clean an HTML fragment that contains a bare ``&`` with every option on.

    The input mixes nested tags, tabs/newlines, punctuation, a backtick
    used as an apostrophe, a lone ``&`` and a ``|``.  With all cleaning
    flags enabled the result is expected to be exactly ``"hello world"``.
    """
    raw_html = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"

    # Same keyword arguments, in the same order, as a direct call would pass.
    options = dict(
        clean_html=True,
        lowercase=True,
        remove_punctuation=True,
        strip=True,
        lemmatize=True,
        fix_single_quotes=True,
        remove_stopwords_en=True,
        remove_urls=True,
    )

    self.assertEqual(clean(raw_html, **options), "hello world")
|
||||
|
Loading…
x
Reference in New Issue
Block a user