From 00323ea5763a7c55978ba1b69d6be00afd174b70 Mon Sep 17 00:00:00 2001
From: simon987
Date: Sun, 18 Apr 2021 18:50:39 -0400
Subject: [PATCH] improve text cleaning

---
 hexlib/regex.py | 2 +-
 hexlib/text.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hexlib/regex.py b/hexlib/regex.py
index 9d79dc0..861b403 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -3,4 +3,4 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+")
diff --git a/hexlib/text.py b/hexlib/text.py
index 6851217..7629163 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -50,9 +50,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
     if compress_whitespace:
         text = WHITESPACE_RE.sub(" ", text)
 
-    if strip:
-        text = text.strip()
-
     if remove_stopwords_en or lemmatize:
         words = WHITESPACE_RE.split(text)
 
@@ -63,4 +60,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
         elif lemmatize and not remove_stopwords_en:
             text = " ".join(lemmatizer.lemmatize(w) for w in words)
 
+    if strip:
+        text = text.strip()
+
     return text