From 00323ea5763a7c55978ba1b69d6be00afd174b70 Mon Sep 17 00:00:00 2001
From: simon987 <me@simon987.net>
Date: Sun, 18 Apr 2021 18:50:39 -0400
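Subject: [PATCH] improve text cleaning

Add "/" to the PUNCTUATION_RE character class, and move the strip step
in clean() so it runs after the stopword-removal/lemmatization block
instead of before it.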

---
 hexlib/regex.py | 2 +-
 hexlib/text.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hexlib/regex.py b/hexlib/regex.py
index 9d79dc0..861b403 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -3,4 +3,4 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+")
diff --git a/hexlib/text.py b/hexlib/text.py
index 6851217..7629163 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -50,9 +50,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
     if compress_whitespace:
         text = WHITESPACE_RE.sub(" ", text)
 
-    if strip:
-        text = text.strip()
-
     if remove_stopwords_en or lemmatize:
         words = WHITESPACE_RE.split(text)
 
@@ -63,4 +60,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
         elif lemmatize and not remove_stopwords_en:
             text = " ".join(lemmatizer.lemmatize(w) for w in words)
 
+    if strip:
+        text = text.strip()
+
     return text