From 45b5803c40a577c6824edd6f5a4332c8aa848375 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sun, 18 Apr 2021 15:40:30 -0400 Subject: [PATCH] improve text cleaning --- hexlib/regex.py | 2 +- hexlib/text.py | 22 ++++++++++++++++++---- test/test_text.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/hexlib/regex.py b/hexlib/regex.py index 348e023..9d79dc0 100644 --- a/hexlib/regex.py +++ b/hexlib/regex.py @@ -3,4 +3,4 @@ import re LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") WHITESPACE_RE = re.compile(r"\s+") -PUNCTUATION_RE = re.compile(r"[.,;:\"']+") +PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+") diff --git a/hexlib/text.py b/hexlib/text.py index 0b226b3..6851217 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -1,3 +1,6 @@ +from functools import partial +from multiprocessing.pool import ThreadPool + import nltk.corpus from lxml import etree from nltk.corpus import stopwords @@ -15,11 +18,22 @@ nltk.download("wordnet", quiet=True) lemmatizer = WordNetLemmatizer() +def clean_multithread(texts, processes, **kwargs): + pool = ThreadPool(processes=processes) + return pool.map( + func=partial(clean, **kwargs), + iterable=texts, + ) + + def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False): + remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False): if compress_whitespace and remove_stopwords_en: raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace") + if fix_single_quotes: + text = text.replace("`", "'") + if clean_html: try: root = etree.fromstring(text) @@ -27,6 +41,9 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st except: pass + if remove_punctuation: + text = PUNCTUATION_RE.sub(" ", text) + if lowercase: text = text.lower() @@ -36,9 +53,6 @@ def clean(text, 
compress_whitespace=False, lowercase=False, clean_html=False, st if strip: text = text.strip() - if remove_punctuation: - text = PUNCTUATION_RE.sub("", text) - if remove_stopwords_en or lemmatize: words = WHITESPACE_RE.split(text) diff --git a/test/test_text.py b/test/test_text.py index abd1ef6..18eeb1c 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -103,3 +103,33 @@ class TestText(TestCase): expected = "hello world" self.assertEqual(cleaned, expected) + + def test_html_8(self): + text = "<div>\n Hello, \t\na the worlds! \n\t</div>" + cleaned = clean( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + remove_stopwords_en=True, + lemmatize=True + ) + expected = "hello world" + + self.assertEqual(cleaned, expected) + + def test_html_9(self): + text = "<div>\n Hello, \t\nworld! it's it`s \n\t</div>" + cleaned = clean( + text, + clean_html=True, + lowercase=True, + remove_punctuation=True, + strip=True, + lemmatize=True, + fix_single_quotes=True + ) + expected = "hello world it's it's" + + self.assertEqual(cleaned, expected)