diff --git a/hexlib/regex.py b/hexlib/regex.py index c4ce1c1..c10465e 100644 --- a/hexlib/regex.py +++ b/hexlib/regex.py @@ -4,3 +4,4 @@ LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") WHITESPACE_RE = re.compile(r"\s+") PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+") +XML_ENTITY_RE = re.compile(r"&[a-z]+;") diff --git a/hexlib/text.py b/hexlib/text.py index 52c901b..e081299 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -6,7 +6,7 @@ from lxml import etree from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer -from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE +from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE, XML_ENTITY_RE get_text = etree.XPath("//text()") @@ -56,9 +56,16 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc if clean_html: try: - root = etree.fromstring(text.replace("&", "")) - text = "".join(get_text(root)) - except: + text = XML_ENTITY_RE.sub(" ", text) + text = text.replace("&", " ") + text = text.replace("
", "
") + text = "" + text + "" + + root = etree.fromstring(text) + + text = " ".join(get_text(root)) + except Exception as e: + raise e pass if remove_punctuation: diff --git a/test/test_text.py b/test/test_text.py index 441e84b..11c9693 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -164,6 +164,25 @@ class TestText(TestCase): self.assertEqual(cleaned, expected) + def test_html_no_root(self): + text = ">>217709510
Is there a servant that is against civilization and humanity?
Literally instant summon." + + cleaned = preprocess( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + lemmatize=False, + fix_single_quotes=True, + remove_stopwords_en=False, + remove_urls=False + ) + + expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" + + self.assertEqual(cleaned, expected) + def test_bigrams(self): text = "x A b c d e f g h" cleaned = preprocess(