normalize dashes in preprocess

This commit is contained in:
simon987 2021-11-16 11:34:48 -05:00
parent 8d8f9e8751
commit e1537297d7
2 changed files with 6 additions and 1 deletions

View File

@ -59,6 +59,9 @@ def _transform_trigram(ngram_seq, ngrams):
# Characters that preprocess() rewrites to the ASCII apostrophe via
# str.translate().
# NOTE(review): the first entry is an empty string in this view; it looks
# like a non-ASCII quote character was lost in transit (presumably a
# typographic apostrophe such as U+2019 -- confirm against the real file and
# re-enter it as a "\uXXXX" escape so it survives re-encoding).
SINGLE_QUOTES = ("", "`")
# Built one character at a time rather than with two parallel strings:
# str.maketrans(x, y) raises ValueError when len(x) != len(y), which is what
# happens as soon as an entry is empty (or longer than one character).
# The result has the same {ordinal: ordinal} shape str.maketrans(x, y) returns.
SINGLE_QUOTE_TRANS = {ord(ch): ord("'") for ch in "".join(SINGLE_QUOTES)}
# Characters that preprocess() rewrites to the ASCII hyphen-minus.
# NOTE(review): all three entries are empty strings in this view, so this
# table is currently a no-op; presumably they were Unicode dashes (en dash,
# em dash, ...) -- confirm and restore them as "\uXXXX" escapes.
DASHES = ("", "", "")
DASHES_TRANS = {ord(ch): ord("-") for ch in "".join(DASHES)}
# Punctuation characters and the table that maps each of them to a single
# space (one space per character keeps both maketrans arguments equal length).
PUNCTUATION = ".,;:\"!?/()|*=>"
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
@ -72,6 +75,8 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
if fix_single_quotes:
text = text.translate(SINGLE_QUOTE_TRANS)
text = text.translate(DASHES_TRANS)
if remove_urls:
text = LINK_RE.sub(" ", text)

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.66",
version="1.67",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",