From 45b5803c40a577c6824edd6f5a4332c8aa848375 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sun, 18 Apr 2021 15:40:30 -0400 Subject: [PATCH] improve text cleaning --- hexlib/regex.py | 2 +- hexlib/text.py | 22 ++++++++++++++++++---- test/test_text.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/hexlib/regex.py b/hexlib/regex.py index 348e023..9d79dc0 100644 --- a/hexlib/regex.py +++ b/hexlib/regex.py @@ -3,4 +3,4 @@ import re LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") WHITESPACE_RE = re.compile(r"\s+") -PUNCTUATION_RE = re.compile(r"[.,;:\"']+") +PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+") diff --git a/hexlib/text.py b/hexlib/text.py index 0b226b3..6851217 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -1,3 +1,6 @@ +from functools import partial +from multiprocessing.pool import ThreadPool + import nltk.corpus from lxml import etree from nltk.corpus import stopwords @@ -15,11 +18,22 @@ nltk.download("wordnet", quiet=True) lemmatizer = WordNetLemmatizer() +def clean_multithread(texts, processes, **kwargs): + pool = ThreadPool(processes=processes) + return pool.map( + func=partial(clean, **kwargs), + iterable=texts, + ) + + def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False): + remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False): if compress_whitespace and remove_stopwords_en: raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace") + if fix_single_quotes: + text = text.replace("`", "'") + if clean_html: try: root = etree.fromstring(text) @@ -27,6 +41,9 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st except: pass + if remove_punctuation: + text = PUNCTUATION_RE.sub(" ", text) + if lowercase: text = text.lower() @@ -36,9 +53,6 @@ def clean(text, 
compress_whitespace=False, lowercase=False, clean_html=False, st if strip: text = text.strip() - if remove_punctuation: - text = PUNCTUATION_RE.sub("", text) - if remove_stopwords_en or lemmatize: words = WHITESPACE_RE.split(text) diff --git a/test/test_text.py b/test/test_text.py index abd1ef6..18eeb1c 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -103,3 +103,33 @@ class TestText(TestCase): expected = "hello world" self.assertEqual(cleaned, expected) + + def test_html_8(self): + text = "<div>\n Hello, \t\na the worlds! \n\t</div>" + cleaned = clean( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + remove_stopwords_en=True, + lemmatize=True + ) + expected = "hello world" + + self.assertEqual(cleaned, expected) + + def test_html_9(self): + text = "<div>\n Hello, \t\nworld! it's it`s \n\t</div>" + cleaned = clean( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + lemmatize=True, + fix_single_quotes=True + ) + expected = "hello world it's it's" + + self.assertEqual(cleaned, expected)