diff --git a/hexlib/text.py b/hexlib/text.py
index 86009ba..7cb7e76 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -59,6 +59,10 @@ def _transform_trigram(ngram_seq, ngrams):
 SINGLE_QUOTES = ("’", "`")
 SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
 
+# Unicode dash variants (en dash, two-em dash, em dash) that preprocess() maps to ASCII "-".
+DASHES = ("–", "⸺", "—")
+DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
+
 PUNCTUATION = ".,;:\"!?/()|*=>"
 PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
 
@@ -72,6 +76,8 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
     if fix_single_quotes:
         text = text.translate(SINGLE_QUOTE_TRANS)
 
+    text = text.translate(DASHES_TRANS)
+
     if remove_urls:
         text = LINK_RE.sub(" ", text)
 
diff --git a/setup.py b/setup.py
index fc520ce..1de8b95 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.66",
+    version="1.67",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",