From 4befc3973df975e949e402f35b833d60f460a45b Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 26 Feb 2022 19:31:22 -0500 Subject: [PATCH] Add strip_dashes option in preprocess() --- hexlib/text.py | 10 ++++++++++ setup.py | 2 +- test/test_text.py | 14 ++++++++++++-- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/hexlib/text.py b/hexlib/text.py index 6877d1f..b1400d7 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -1,3 +1,4 @@ +import re from functools import partial from itertools import chain, repeat from multiprocessing.pool import Pool @@ -53,6 +54,8 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l DASHES = ("–", "⸺", "–", "—") DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES)))) +DASHES_RE = re.compile(r"-+") + SPECIAL_PUNCTUATION = ";:\"/()|*=>" SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION)) @@ -62,6 +65,7 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False, remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, + strip_dashes=False, remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False): if lowercase: text = text.lower() @@ -71,6 +75,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False text = text.translate(DASHES_TRANS) + if strip_dashes: + text = DASHES_RE.sub("-", text) + if remove_urls: text = LINK_RE.sub(" ", text) @@ -96,6 +103,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False if strip_quotes: words = map(lambda w: w.strip("\"'“”"), words) + if strip_dashes: + words = map(lambda w: w.strip("-"), words) + if bigrams: words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) diff --git a/setup.py b/setup.py index 6bc0237..c55edaa 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.76", + version="1.77", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 3d6f7ce..e83bea5 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -152,7 +152,7 @@ class TestText(TestCase): remove_stopwords_en=True, remove_urls=True ) - expected = "hello world" + expected = "hello world |" self.assertEqual(" ".join(cleaned), expected) @@ -170,7 +170,7 @@ class TestText(TestCase): remove_urls=False ) - expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" + expected = ">>217709510 is there a servant that is against civilization and humanity literally instant summon" self.assertEqual(" ".join(cleaned), expected) def test_html_entity(self): @@ -257,3 +257,13 @@ class TestText(TestCase): expected = "hi test hello" self.assertEqual(" ".join(cleaned), expected) + + def test_strip_dashes(self): + text = "yes -But something-something -- hello aa--bb" + cleaned = preprocess( + text, + strip_dashes=True + ) + expected = "yes But something-something hello aa-bb" + + self.assertEqual(" ".join(cleaned), expected)