1
0
mirror of https://github.com/simon987/hexlib.git synced 2025-04-08 21:26:42 +00:00

Split punctuation into punctuation and special_punctuation

This commit is contained in:
simon987 2022-02-23 11:01:17 -05:00
parent 084acbe184
commit c9fac7151a
2 changed files with 11 additions and 5 deletions

@@ -53,13 +53,16 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l
DASHES = ("", "", "", "")
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
-PUNCTUATION = ".,;:\"!?/()|*=>"
+SPECIAL_PUNCTUATION = ";:\"/()|*=>"
+SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION))
+PUNCTUATION = ".,!?"
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
-def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
-lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
-trigrams: set = None, remove_numbers=False):
+def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
+remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
+remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
if lowercase:
text = text.lower()
@@ -85,6 +88,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
if remove_punctuation:
text = text.translate(PUNCTUATION_TRANS)
+if remove_special_punctuation:
+text = text.translate(SPECIAL_PUNCTUATION_TRANS)
words = text.split()
if strip_quotes:

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
-version="1.75",
+version="1.76",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",