improve text cleaning

This commit is contained in:
simon987 2021-04-18 18:50:39 -04:00
parent 45b5803c40
commit 00323ea576
2 changed files with 4 additions and 4 deletions

View File

@@ -3,4 +3,4 @@ import re
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+")

View File

@@ -50,9 +50,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
if compress_whitespace:
text = WHITESPACE_RE.sub(" ", text)
-if strip:
-text = text.strip()
if remove_stopwords_en or lemmatize:
words = WHITESPACE_RE.split(text)
@@ -63,4 +60,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
elif lemmatize and not remove_stopwords_en:
text = " ".join(lemmatizer.lemmatize(w) for w in words)
+if strip:
+text = text.strip()
return text