mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-04 02:12:59 +00:00
Fix strip_quotes
This commit is contained in:
parent
3677815d57
commit
55fd4a66d2
@@ -97,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
||||
words = text.split()
|
||||
|
||||
if strip_quotes:
|
||||
- words = filter(lambda w: w.strip("\"'“"), words)
|
||||
+ words = map(lambda w: w.strip("\"'“”"), words)
|
||||
|
||||
if bigrams:
|
||||
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)
|
||||
|
2
setup.py
2
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="hexlib",
|
||||
- version="1.68",
|
||||
+ version="1.69",
|
||||
description="Misc utility methods",
|
||||
author="simon987",
|
||||
author_email="me@simon987.net",
|
||||
|
@@ -247,3 +247,13 @@ class TestText(TestCase):
|
||||
expected = "hello1 test1124test world"
|
||||
|
||||
self.assertEqual(" ".join(cleaned), expected)
|
||||
|
||||
def test_strip_quotes(self):
|
||||
text = "'hi' “test” 'hello\""
|
||||
cleaned = preprocess(
|
||||
text,
|
||||
strip_quotes=True
|
||||
)
|
||||
expected = "hi test hello"
|
||||
|
||||
self.assertEqual(" ".join(cleaned), expected)
|
||||
|
Loading…
x
Reference in New Issue
Block a user