diff --git a/hexlib/text.py b/hexlib/text.py
index d6e5a22..4172103 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -3,6 +3,7 @@ from itertools import chain, repeat
 
 import nltk.corpus
 from lxml import etree
+from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 
@@ -12,6 +13,7 @@ get_text = etree.XPath("//text()")
 
 nltk.download("stopwords", quiet=True)
 nltk.download("wordnet", quiet=True)
+nltk.download("punkt", quiet=True)
 
 stop_words_en = set(stopwords.words("english"))
 
@@ -64,7 +66,8 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
 
 def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
                remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, strip_dashes=False,
-               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
+               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False,
+               use_nltk_tokenizer=False):
     if lowercase:
         text = text.lower()
 
@@ -96,7 +99,10 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
     if remove_special_punctuation:
         text = text.translate(SPECIAL_PUNCTUATION_TRANS)
 
-    words = text.split()
+    if use_nltk_tokenizer:
+        words = word_tokenize(text, language="english")
+    else:
+        words = text.split()
 
     if strip_quotes:
         words = map(lambda w: w.strip("\"'“”"), words)
diff --git a/setup.py b/setup.py
index 7b9d485..c9aad1e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.88",
+    version="1.89",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
diff --git a/test/test_text.py b/test/test_text.py
index e83bea5..1a30018 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -267,3 +267,13 @@
 
         expected = "yes But something-something hello aa-bb"
         self.assertEqual(" ".join(cleaned), expected)
+
+    def test_word_tokenize(self):
+        text = "i cannot believe'"
+        cleaned = preprocess(
+            text,
+            use_nltk_tokenizer=True
+        )
+        expected = "i can not believe '"
+
+        self.assertEqual(" ".join(cleaned), expected)