From 7e0ffafb8c899f269160f88cf60e18356f43a170 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 28 Aug 2021 20:48:00 -0400 Subject: [PATCH] Update fix_single_quotes --- hexlib/regex.py | 2 +- hexlib/text.py | 1 + setup.py | 2 +- test/test_text.py | 11 +++++++++++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/hexlib/regex.py b/hexlib/regex.py index b579974..9c99f5f 100644 --- a/hexlib/regex.py +++ b/hexlib/regex.py @@ -3,5 +3,5 @@ import re LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"") WHITESPACE_RE = re.compile(r"\s+") -PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+") +PUNCTUATION_RE = re.compile(r"[.,;:\"“!?/()|*=>]+") XML_ENTITY_RE = re.compile(r"&[a-z]+;") diff --git a/hexlib/text.py b/hexlib/text.py index d4916c0..af6d3b9 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -51,6 +51,7 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc if fix_single_quotes: text = text.replace("`", "'") + text = text.replace("’", "'") if remove_urls: text = LINK_RE.sub(" ", text) diff --git a/setup.py b/setup.py index d4ad03b..9e62f82 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.48", + version="1.49", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 5d698f0..503f628 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -130,6 +130,17 @@ class TestText(TestCase): expected = "hello world it's it's" self.assertEqual(cleaned, expected) + + def test_single_quote(self): + text = "it's it`s it’s" + cleaned = preprocess( + text, + lowercase=True, + fix_single_quotes=True + ) + expected = "it's it's it's" + + self.assertEqual(cleaned, expected) def test_html_10(self): text = "
\n Hello, \t\nworld! it's it`s https://google.ca/test/abc.pdf \n\t
"