From 2ffaa4a5b320a447a0c2fb5877aa636c786ef980 Mon Sep 17 00:00:00 2001
From: simon987
Date: Sun, 18 Apr 2021 21:10:07 -0400
Subject: [PATCH] improve text cleaning

---
 hexlib/regex.py   |  2 +-
 hexlib/text.py    | 25 ++++++++++++++++++-------
 test/test_text.py | 19 ++++++++++++++++---
 3 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/hexlib/regex.py b/hexlib/regex.py
index 861b403..c4ce1c1 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -3,4 +3,4 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
diff --git a/hexlib/text.py b/hexlib/text.py
index 36f1fba..01e3f6e 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -6,12 +6,18 @@ from lxml import etree
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 
-from .regex import WHITESPACE_RE, PUNCTUATION_RE
+from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE
 
 get_text = etree.XPath("//text()")
 
 stop_words_en = set(stopwords.words("english"))
 
+extra_stop_words_en = [
+    "u", "&", "-", "--"
+]
+
+stop_words_en.update(extra_stop_words_en)
+
 nltk.download("stopwords", quiet=True)
 nltk.download("wordnet", quiet=True)
 
@@ -26,14 +32,15 @@ def clean_multicore(texts, processes, **kwargs):
     )
 
 
-def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
-          remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False):
-    if compress_whitespace and remove_stopwords_en:
-        raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace")
-
+def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
+          remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
+          remove_urls=False):
     if fix_single_quotes:
         text = text.replace("`", "'")
 
+    if remove_urls:
+        text = LINK_RE.sub(" ", text)
+
     if clean_html:
         try:
             root = etree.fromstring(text)
@@ -47,9 +54,13 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
     if lowercase:
         text = text.lower()
 
-    if compress_whitespace:
+    if not remove_stopwords_en or not lemmatize or not strip_quotes:
         text = WHITESPACE_RE.sub(" ", text)
 
+    if strip_quotes:
+        words = WHITESPACE_RE.split(text)
+        text = " ".join(w.strip("\"'") for w in words)
+
     if remove_stopwords_en or lemmatize:
         words = WHITESPACE_RE.split(text)
 
diff --git a/test/test_text.py b/test/test_text.py
index 18eeb1c..f3555e9 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -42,7 +42,6 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
-            compress_whitespace=True
         )
 
         expected = " hello, world "
@@ -54,7 +53,6 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
-            compress_whitespace=True,
             strip=True
         )
         expected = "hello, world"
@@ -67,7 +65,6 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
-            compress_whitespace=True,
             strip=True,
             remove_punctuation=True
         )
@@ -133,3 +130,19 @@ class TestText(TestCase):
         expected = "hello world it's it's"
 
         self.assertEqual(cleaned, expected)
+
+    def test_html_10(self):
+        text = "<div>\n Hello, \t\nworld! it's it`s https://google.ca/test/abc.pdf \n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            lemmatize=True,
+            fix_single_quotes=True,
+            remove_urls=True
+        )
+        expected = "hello world it's it's"
+
+        self.assertEqual(cleaned, expected)