Fix clean_html

This commit is contained in:
2021-08-28 19:33:11 -04:00
parent db3e191983
commit 4d6c8018df
3 changed files with 31 additions and 4 deletions

View File

@@ -164,6 +164,25 @@ class TestText(TestCase):
self.assertEqual(cleaned, expected)
def test_html_no_root(self):
    """Run preprocess() on an HTML fragment that has no single root element.

    The input is an anchor tag followed by ``<br>``-separated sentences,
    i.e. several sibling nodes with no wrapping element. The expected
    result is the lowercase text with markup and punctuation gone.
    """
    # Adjacent literals are concatenated into one fragment at compile time.
    raw_html = (
        '<a href="#p217709510" class="quotelink">&gt;&gt;217709510</a>'
        '<br>Is there a servant that is against civilization and humanity?'
        '<br>Literally instant summon.'
    )
    options = {
        "clean_html": True,
        "lowercase": True,
        "remove_punctuation": True,
        "strip": True,
        "lemmatize": False,
        "fix_single_quotes": True,
        "remove_stopwords_en": False,
        "remove_urls": False,
    }
    wanted = (
        "217709510 is there a servant that is against civilization "
        "and humanity literally instant summon"
    )

    result = preprocess(raw_html, **options)

    self.assertEqual(result, wanted)
def test_bigrams(self):
text = "x A b c d e f g h"
cleaned = preprocess(