add bigram option for clean function

add retries arg in get_web()
2025-10-29 23:46:52 +00:00 · 2021-04-21 21:34:49 -04:00 · 2021-04-21 19:50:59 -04:00
4 changed files with 37 additions and 6 deletions
--- a/hexlib/env.py
+++ b/hexlib/env.py
@ -45,6 +45,7 @@ def get_web(session=None):
        logger=stdout_logger,
        cookie_file=os.environ.get("COOKIE_FILE", None),
        retry_codes=set(int(x) if x else None for x in os.environ.get("RETRY_CODES", "").split(",")),
        retries=int(os.environ.get("RETRIES", 3)),
        retry_sleep=int(os.environ.get("RETRY_SLEEP", 0)),
        ua=ua[os.environ.get("USER_AGENT")] if os.environ.get("USER_AGENT", None) is not None else None
    )
--- a/hexlib/text.py
+++ b/hexlib/text.py
@ -32,9 +32,22 @@ def clean_multicore(texts, processes, **kwargs):
    )
 def _transform_bigram(ngram_seq, ngrams):
    for ngram in ngram_seq:
        if ngram in ngrams:
            yield "_".join(ngram)
            ngram_seq.__next__()
        else:
            yield ngram[0]
 def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
          remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
-          remove_urls=False):
+          remove_urls=False, bigrams: set = None):
    if lowercase:
        text = text.lower()
    if fix_single_quotes:
        text = text.replace("`", "'")
@ -51,9 +64,6 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
    if remove_punctuation:
        text = PUNCTUATION_RE.sub(" ", text)
    if lowercase:
        text = text.lower()
    if not remove_stopwords_en or not lemmatize or not strip_quotes:
        text = WHITESPACE_RE.sub(" ", text)
@ -61,6 +71,11 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
        words = WHITESPACE_RE.split(text)
        text = " ".join(w.strip("\"'") for w in words)
    if bigrams:
        words = WHITESPACE_RE.split(text)
        words.append("*")
        text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
    if remove_stopwords_en or lemmatize:
        words = WHITESPACE_RE.split(text)
--- a/setup.py
+++ b/setup.py
@ -2,7 +2,7 @@ from setuptools import setup
 setup(
    name="hexlib",
-    version="1.40",
+    version="1.41",
    description="Misc utility methods",
    author="simon987",
    author_email="me@simon987.net",
--- a/test/test_text.py
+++ b/test/test_text.py
@ -148,7 +148,7 @@ class TestText(TestCase):
        self.assertEqual(cleaned, expected)
    def test_html_11(self):
-        text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
+        text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
        cleaned = clean(
            text,
            clean_html=True,
@ -163,3 +163,18 @@ class TestText(TestCase):
        expected = "hello world"
        self.assertEqual(cleaned, expected)
    def test_bigrams(self):
        """clean() joins each configured word pair with an underscore."""
        known_pairs = {("a", "b"), ("c", "d"), ("f", "g")}
        # Input contains an uppercase "A"; lowercase=True is passed so the
        # ("a", "b") pair is expected to match.
        result = clean("x A b c d e f g h", lowercase=True, bigrams=known_pairs)
        self.assertEqual(result, "x a_b c_d e f_g h")
Author	SHA1	Message	Date
simon987	88f3124f85	add bigram option for clean function	2021-04-21 21:34:49 -04:00
simon987	8edad0255b	add retries arg in get_web()	2021-04-21 19:50:59 -04:00