diff --git a/hexlib/text.py b/hexlib/text.py index 4ff5904..bc660ee 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -97,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False words = text.split() if strip_quotes: - words = filter(lambda w: w.strip("\"'“"), words) + words = map(lambda w: w.strip("\"'“”"), words) if bigrams: words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) diff --git a/setup.py b/setup.py index 9199c84..073c7ea 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.68", + version="1.69", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index e10e1bd..3d6f7ce 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -247,3 +247,13 @@ class TestText(TestCase): expected = "hello1 test1124test world" self.assertEqual(" ".join(cleaned), expected) + + def test_strip_quotes(self): + text = "'hi' “test” 'hello\"" + cleaned = preprocess( + text, + strip_quotes=True + ) + expected = "hi test hello" + + self.assertEqual(" ".join(cleaned), expected)