Fix strip_quotes

This commit is contained in:
simon987 2021-11-16 11:48:23 -05:00
parent 3677815d57
commit 55fd4a66d2
3 changed files with 12 additions and 2 deletions

View File

@@ -97,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
words = text.split()
if strip_quotes:
words = filter(lambda w: w.strip("\"'"), words)
words = map(lambda w: w.strip("\"'"), words)
if bigrams:
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.68",
version="1.69",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",

View File

@@ -247,3 +247,13 @@ class TestText(TestCase):
expected = "hello1 test1124test world"
self.assertEqual(" ".join(cleaned), expected)
def test_strip_quotes(self):
text = "'hi' “test” 'hello\""
cleaned = preprocess(
text,
strip_quotes=True
)
expected = "hi test hello"
self.assertEqual(" ".join(cleaned), expected)