improve text cleaning

2025-04-24 12:05:50 +00:00 · 2021-04-18 18:50:39 -04:00 · 2021-04-18 18:50:39 -04:00 · 00323ea576
commit 00323ea576
parent 45b5803c40
2 changed files with 4 additions and 4 deletions
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -3,4 +3,4 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+")
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -50,9 +50,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
    if compress_whitespace:
        text = WHITESPACE_RE.sub(" ", text)

-    if strip:
-        text = text.strip()
-
    if remove_stopwords_en or lemmatize:
        words = WHITESPACE_RE.split(text)

@@ -63,4 +60,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
        elif lemmatize and not remove_stopwords_en:
            text = " ".join(lemmatizer.lemmatize(w) for w in words)

+    if strip:
+        text = text.strip()
+
    return text