From c9fac7151a5d946a6effa80581bb8a902d8dd786 Mon Sep 17 00:00:00 2001 From: simon987 Date: Wed, 23 Feb 2022 11:01:17 -0500 Subject: [PATCH] Split punctuation into punctuation and special_punctuation --- hexlib/text.py | 14 ++++++++++---- setup.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/hexlib/text.py b/hexlib/text.py index 49aa5b8..6877d1f 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -53,13 +53,16 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l DASHES = ("–", "⸺", "–", "—") DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES)))) -PUNCTUATION = ".,;:\"!?/()|*=>" +SPECIAL_PUNCTUATION = ";:\"/()|*=>" +SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION)) + +PUNCTUATION = ".,!?" PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) -def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False, - lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None, - trigrams: set = None, remove_numbers=False): +def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False, + remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, + remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False): if lowercase: text = text.lower() @@ -85,6 +88,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False if remove_punctuation: text = text.translate(PUNCTUATION_TRANS) + if remove_special_punctuation: + text = text.translate(SPECIAL_PUNCTUATION_TRANS) + words = text.split() if strip_quotes: diff --git a/setup.py b/setup.py index 3c435a6..6bc0237 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.75", + version="1.76", description="Misc utility methods", author="simon987", author_email="me@simon987.net",