diff --git a/hexlib/regex.py b/hexlib/regex.py index 9d79dc0..861b403 100644 --- a/hexlib/regex.py +++ b/hexlib/regex.py @@ -3,4 +3,4 @@ import re LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") WHITESPACE_RE = re.compile(r"\s+") -PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+") +PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+") diff --git a/hexlib/text.py b/hexlib/text.py index 6851217..7629163 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -50,9 +50,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st if compress_whitespace: text = WHITESPACE_RE.sub(" ", text) - if strip: - text = text.strip() - if remove_stopwords_en or lemmatize: words = WHITESPACE_RE.split(text) @@ -63,4 +60,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st elif lemmatize and not remove_stopwords_en: text = " ".join(lemmatizer.lemmatize(w) for w in words) + if strip: + text = text.strip() + return text