From 88f3124f85050a03bce7f53025c5b775fe8f94e5 Mon Sep 17 00:00:00 2001
From: simon987 <me@simon987.net>
Date: Wed, 21 Apr 2021 21:34:49 -0400
Subject: [PATCH] add bigram option for clean function

---
 hexlib/text.py    | 23 +++++++++++++++++++----
 test/test_text.py | 17 ++++++++++++++++-
 2 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/hexlib/text.py b/hexlib/text.py
index 97847aa..b781782 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -32,9 +32,22 @@ def clean_multicore(texts, processes, **kwargs):
     )
 
 
+def _transform_bigram(ngram_seq, ngrams):
+    """Yield "a_b" for pairs of iterator *ngram_seq* found in *ngrams*, else the first word."""
+    for ngram in ngram_seq:
+        if ngram in ngrams:
+            yield "_".join(ngram)
+            next(ngram_seq, None)  # skip the overlapping pair; a default avoids PEP 479 RuntimeError
+        else:
+            yield ngram[0]
+
+
 def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
           remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
-          remove_urls=False):
+          remove_urls=False, bigrams: set = None):
+    if lowercase:
+        text = text.lower()
+
     if fix_single_quotes:
         text = text.replace("`", "'")
 
@@ -51,9 +64,6 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
     if remove_punctuation:
         text = PUNCTUATION_RE.sub(" ", text)
 
-    if lowercase:
-        text = text.lower()
-
     if not remove_stopwords_en or not lemmatize or not strip_quotes:
         text = WHITESPACE_RE.sub(" ", text)
 
@@ -61,6 +71,11 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
         words = WHITESPACE_RE.split(text)
         text = " ".join(w.strip("\"'") for w in words)
 
+    if bigrams:
+        words = WHITESPACE_RE.split(text)
+        words.append("*")
+        text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
+
     if remove_stopwords_en or lemmatize:
         words = WHITESPACE_RE.split(text)
 
diff --git a/test/test_text.py b/test/test_text.py
index ed9c9a6..334f456 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -148,7 +148,7 @@ class TestText(TestCase):
         self.assertEqual(cleaned, expected)
 
     def test_html_11(self):
-        text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
+        text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
         cleaned = clean(
             text,
             clean_html=True,
@@ -163,3 +163,18 @@ class TestText(TestCase):
         expected = "hello world"
 
         self.assertEqual(cleaned, expected)
+
+    def test_bigrams(self):
+        # Pairs listed in `bigrams` are joined with "_"; other words are kept.
+        known_pairs = {
+            ("a", "b"),
+            ("c", "d"),
+            ("f", "g"),
+        }
+        source = "x A b c d e f g h"
+
+        # The pairs are lowercase, so "A" only matches because lowercase=True.
+        result = clean(source, lowercase=True, bigrams=known_pairs)
+
+        wanted = "x a_b c_d e f_g h"
+        self.assertEqual(result, wanted)