Fix strip_quotes

This commit is contained in:
simon987 2021-11-16 11:48:23 -05:00
parent 3677815d57
commit 55fd4a66d2
3 changed files with 12 additions and 2 deletions

View File

@@ -97,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
words = text.split() words = text.split()
if strip_quotes: if strip_quotes:
words = filter(lambda w: w.strip("\"'"), words) words = map(lambda w: w.strip("\"'"), words)
if bigrams: if bigrams:
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup( setup(
name="hexlib", name="hexlib",
version="1.68", version="1.69",
description="Misc utility methods", description="Misc utility methods",
author="simon987", author="simon987",
author_email="me@simon987.net", author_email="me@simon987.net",

View File

@@ -247,3 +247,13 @@ class TestText(TestCase):
expected = "hello1 test1124test world" expected = "hello1 test1124test world"
self.assertEqual(" ".join(cleaned), expected) self.assertEqual(" ".join(cleaned), expected)
def test_strip_quotes(self):
    """Check that preprocess(..., strip_quotes=True) returns unquoted words.

    The sample wraps three words in different quote styles: straight
    single quotes, curly double quotes, and a mismatched single/double
    pair. The words produced by preprocess are joined with single
    spaces and compared against the bare words.
    """
    # NOTE(review): the sample contains curly quotes and expects them to be
    # removed as well; confirm that preprocess strips non-ASCII quote
    # characters and not only the straight ' and " characters.
    sample = "'hi' “test” 'hello\""
    wanted = "hi test hello"

    tokens = preprocess(sample, strip_quotes=True)
    joined = " ".join(tokens)

    self.assertEqual(joined, wanted)