Compare commits

...

5 Commits

SHA1 Message Date
32119535ae improve text cleaning 2021-04-18 21:27:12 -04:00
2ffaa4a5b3 improve text cleaning 2021-04-18 21:10:07 -04:00
067a20f7a8 improve text cleaning 2021-04-18 20:32:34 -04:00
00323ea576 improve text cleaning 2021-04-18 18:50:39 -04:00
45b5803c40 improve text cleaning 2021-04-18 15:40:30 -04:00
3 changed files with 101 additions and 16 deletions

View File

@@ -3,4 +3,4 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"']+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
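
The only change here widens PUNCTUATION_RE: the class gains !?/()|*= and, notably, loses the single quote, so contractions survive punctuation removal. A quick standalone sketch of the difference (not part of the diff):

import re

OLD_RE = re.compile(r"[.,;:\"']+")          # before this commit range
NEW_RE = re.compile(r"[.,;:\"!?/()|*=]+")   # after

s = "it's (almost) done!?"
print(OLD_RE.sub("", s))  # its (almost) done!?  -- apostrophe stripped, !?() kept
print(NEW_RE.sub("", s))  # it's almost done     -- apostrophe kept, !?() stripped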

View File

@@ -1,43 +1,65 @@
+from functools import partial
+from multiprocessing.pool import Pool
 import nltk.corpus
 from lxml import etree
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-from .regex import WHITESPACE_RE, PUNCTUATION_RE
+from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE
 get_text = etree.XPath("//text()")
 stop_words_en = set(stopwords.words("english"))
+extra_stop_words_en = [
+    "u", "&", "-", "--"
+]
+stop_words_en.update(extra_stop_words_en)
 nltk.download("stopwords", quiet=True)
 nltk.download("wordnet", quiet=True)
 lemmatizer = WordNetLemmatizer()
-def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
-          remove_stopwords_en=False, lemmatize=False):
-    if compress_whitespace and remove_stopwords_en:
-        raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace")
+def clean_multicore(texts, processes, **kwargs):
+    pool = Pool(processes=processes)
+    return pool.map(
+        func=partial(clean, **kwargs),
+        iterable=texts,
+    )
+def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
+          remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
+          remove_urls=False):
+    if fix_single_quotes:
+        text = text.replace("`", "'")
+    if remove_urls:
+        text = LINK_RE.sub(" ", text)
     if clean_html:
         try:
-            root = etree.fromstring(text)
+            root = etree.fromstring(text.replace("&", ""))
             text = "".join(get_text(root))
         except:
            pass
-    if remove_punctuation:
-        text = PUNCTUATION_RE.sub(" ", text)
     if lowercase:
         text = text.lower()
-    if compress_whitespace:
+    if not remove_stopwords_en or not lemmatize or not strip_quotes:
         text = WHITESPACE_RE.sub(" ", text)
-    if strip:
-        text = text.strip()
+    if strip_quotes:
+        words = WHITESPACE_RE.split(text)
+        text = " ".join(w.strip("\"'") for w in words)
+    if remove_punctuation:
+        text = PUNCTUATION_RE.sub("", text)
     if remove_stopwords_en or lemmatize:
         words = WHITESPACE_RE.split(text)
@@ -49,4 +71,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
     elif lemmatize and not remove_stopwords_en:
         text = " ".join(lemmatizer.lemmatize(w) for w in words)
+    if strip:
+        text = text.strip()
     return text
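
A hedged usage sketch of the two entry points above; the import path is hypothetical (point it at wherever this module lives in the repo), and clean_multicore simply forwards its keyword arguments to clean on each worker:

from cleaning import clean, clean_multicore  # hypothetical module path

docs = [
    "<div>Hello, <strong>world!</strong></div>",
    "See https://example.com/a.pdf for details",
]

if __name__ == "__main__":  # guard required for multiprocessing on spawn platforms
    results = clean_multicore(
        docs,
        processes=4,
        clean_html=True,
        lowercase=True,
        remove_punctuation=True,
        remove_urls=True,
        strip=True,
    )
    print(results)  # expected: ['hello world', 'see for details']

    # Single documents still go through clean() directly:
    print(clean("It`s fine!", fix_single_quotes=True, remove_punctuation=True))
    # expected: It's fine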

View File

@@ -42,7 +42,6 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
-            compress_whitespace=True
         )
         expected = " hello, world "
@@ -54,7 +53,6 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
-            compress_whitespace=True,
             strip=True
         )
         expected = "hello, world"
@@ -67,7 +65,6 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
-            compress_whitespace=True,
             strip=True,
             remove_punctuation=True
         )
@@ -103,3 +100,66 @@ class TestText(TestCase):
         expected = "hello world"
         self.assertEqual(cleaned, expected)
+
+    def test_html_8(self):
+        text = "<div>\n Hello, \t\n<strong>a the worlds! </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            remove_stopwords_en=True,
+            lemmatize=True
+        )
+        expected = "hello world"
+        self.assertEqual(cleaned, expected)
+
+    def test_html_9(self):
+        text = "<div>\n Hello, \t\n<strong>world! it's it`s </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            lemmatize=True,
+            fix_single_quotes=True
+        )
+        expected = "hello world it's it's"
+        self.assertEqual(cleaned, expected)
+
+    def test_html_10(self):
+        text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            lemmatize=True,
+            fix_single_quotes=True,
+            remove_urls=True
+        )
+        expected = "hello world it's it's"
+        self.assertEqual(cleaned, expected)
+
+    def test_html_11(self):
+        text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            lemmatize=True,
+            fix_single_quotes=True,
+            remove_stopwords_en=True,
+            remove_urls=True
+        )
+        expected = "hello world"
+        self.assertEqual(cleaned, expected)
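
To make the reordered pipeline concrete, a hedged trace of test_html_10's input through the new clean() (intermediate strings are approximate around whitespace; the lemmatizer is assumed to leave these tokens unchanged):

s = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
# 1. fix_single_quotes   it`s -> it's
# 2. remove_urls         LINK_RE replaces the https://google.ca/... token with a space
# 3. clean_html          lxml keeps only the text nodes: roughly "\n Hello, \t\nworld! it's it's \n\t"
# 4. lowercase           "\n hello, \t\nworld! it's it's \n\t"
# 5. whitespace collapse (runs because strip_quotes is False): " hello, world! it's it's "
# 6. remove_punctuation  "," and "!" are dropped; "'" is no longer in PUNCTUATION_RE
# 7. lemmatize           "hello", "world", "it's" are already base forms
# 8. strip               -> "hello world it's it's"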