Add option to use nltk word_tokenize

Author: simon987
Date:   2023-09-09 11:11:44 -04:00
Parent: a047366926
Commit: b1a1da3bac
3 changed files with 19 additions and 3 deletions

File 1 of 3:

@@ -3,6 +3,7 @@ from itertools import chain, repeat
 import nltk.corpus
 from lxml import etree
+from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer

@@ -12,6 +13,7 @@ get_text = etree.XPath("//text()")
 nltk.download("stopwords", quiet=True)
 nltk.download("wordnet", quiet=True)
+nltk.download("punkt", quiet=True)

 stop_words_en = set(stopwords.words("english"))
@@ -64,7 +66,8 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
 def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
                remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
                strip_dashes=False,
-               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
+               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False,
+               use_nltk_tokenizer=False):

     if lowercase:
         text = text.lower()

@@ -96,7 +99,10 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
     if remove_special_punctuation:
         text = text.translate(SPECIAL_PUNCTUATION_TRANS)

-    words = text.split()
+    if use_nltk_tokenizer:
+        words = word_tokenize(text, language="english")
+    else:
+        words = text.split()

     if strip_quotes:
         words = map(lambda w: w.strip("\"'“”"), words)
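For reference, a minimal sketch of how the new flag is meant to be used. The import path hexlib.text and the printed outputs are assumptions for illustration, not taken from this diff:

# Hypothetical usage of the new use_nltk_tokenizer flag.
# Assumes preprocess lives in hexlib.text; adjust the import as needed.
from hexlib.text import preprocess

text = "Can't stop, won't stop."

# Default behaviour: plain whitespace tokenization via str.split().
print(" ".join(preprocess(text)))
# expected: "Can't stop, won't stop."

# New behaviour: NLTK's word_tokenize, which also splits punctuation
# and contractions into separate tokens.
print(" ".join(preprocess(text, use_nltk_tokenizer=True)))
# expected: "Ca n't stop , wo n't stop ."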

File 2 of 3:

@@ -2,7 +2,7 @@ from setuptools import setup
 setup(
     name="hexlib",
-    version="1.88",
+    version="1.89",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",

File 3 of 3:

@@ -267,3 +267,13 @@ class TestText(TestCase):
         expected = "yes But something-something hello aa-bb"
         self.assertEqual(" ".join(cleaned), expected)
+
+    def test_word_tokenize(self):
+        text = "i cannot believe'"
+        cleaned = preprocess(
+            text,
+            use_nltk_tokenizer=True
+        )
+
+        expected = "i can not believe '"
+        self.assertEqual(" ".join(cleaned), expected)
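The expected string in this test follows from NLTK's Treebank-style tokenizer, which word_tokenize uses under the hood: it splits the contraction "cannot" into two tokens and separates the trailing quote. A quick check with plain NLTK, outside hexlib:

from nltk import word_tokenize
import nltk

# word_tokenize needs the punkt model, which the diff above downloads.
nltk.download("punkt", quiet=True)

print(word_tokenize("i cannot believe'"))
# ['i', 'can', 'not', 'believe', "'"]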