diff --git a/bench/text.py b/bench/text.py new file mode 100644 index 0000000..a9ce01c --- /dev/null +++ b/bench/text.py @@ -0,0 +1,10 @@ +from timeit import timeit + +if __name__ == '__main__': + + res = timeit( + setup="from hexlib.text import preprocess", + stmt='text = "x A b c d e f g h"\ncleaned = preprocess(\n text,\n lowercase=True,\n trigrams={\n ("a", "b", "c"),\n ("e", "f", "g"),\n }\n)' + ) + + print(res) \ No newline at end of file diff --git a/hexlib/files.py b/hexlib/files.py index 38b8ccf..d3e3205 100644 --- a/hexlib/files.py +++ b/hexlib/files.py @@ -62,6 +62,16 @@ COMPRESSION_GZIP = "gz" COMPRESSION_ZSTD = "zstd" +class NDJsonLine: + __slots__ = "text" + + def __init__(self, text): + self.text = text + + def json(self): + return json.loads(self.text) + + def ndjson_iter(*files, compression=""): for file in files: cleanup = None @@ -90,7 +100,6 @@ def ndjson_iter(*files, compression=""): line_iter.close() for line in line_iter: - yield json.loads(line) + yield NDJsonLine(line) if cleanup: cleanup() - diff --git a/hexlib/text.py b/hexlib/text.py index 77688f9..9fb3912 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -1,4 +1,5 @@ from functools import partial +from itertools import chain, repeat from multiprocessing.pool import Pool import nltk.corpus @@ -36,9 +37,9 @@ def clean_multicore(texts, processes, chunk_size=10000, **kwargs): def _transform_bigram(ngram_seq, ngrams): for ngram in ngram_seq: if ngram in ngrams: - yield "_".join(ngram) + yield ngram[0] + "_" + ngram[1] - ngram_seq.__next__() + next(ngram_seq) else: yield ngram[0] @@ -46,23 +47,27 @@ def _transform_bigram(ngram_seq, ngrams): def _transform_trigram(ngram_seq, ngrams): for ngram in ngram_seq: if ngram in ngrams: + # yield ngram[0] + "_" + ngram[1] + "_" + ngram[2] yield "_".join(ngram) - ngram_seq.__next__() - ngram_seq.__next__() + next(ngram_seq) + next(ngram_seq) else: yield ngram[0] -def preprocess(text, lowercase=False, clean_html=False, strip=False, 
remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False): +SINGLE_QUOTES = ("’", "`") +SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES)))) + + +def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False, + lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None, + trigrams: set = None, remove_numbers=False): if lowercase: text = text.lower() if fix_single_quotes: - text = text.replace("`", "'") - text = text.replace("’", "'") + text = text.translate(SINGLE_QUOTE_TRANS) if remove_urls: text = LINK_RE.sub(" ", text) @@ -81,39 +86,24 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc if remove_punctuation: text = PUNCTUATION_RE.sub(" ", text) - text = WHITESPACE_RE.sub(" ", text) + words = WHITESPACE_RE.sub(" ", text).split(" ") if strip_quotes: - words = text.split(" ") - text = " ".join(w.strip("\"'") for w in words) + words = map(lambda w: w.strip("\"'"), words) if bigrams: - words = text.split(" ") - words.append("*") - text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) + words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) if trigrams: - words = text.split(" ") - words.append("*") - words.append("*") - text = " ".join(_transform_trigram(nltk.trigrams(words), trigrams)) + words = _transform_trigram(nltk.trigrams(chain(words, ("*", "*"))), trigrams) - if remove_stopwords_en or lemmatize or remove_numbers: - words = text.split(" ") + if remove_numbers: + words = filter(lambda w: not w.isnumeric(), words) - if remove_numbers: - words = filter(lambda w: not w.isnumeric(), words) + if lemmatize: + words = map(lambda w: lemmatizer.lemmatize(w), words) - if not lemmatize and not remove_stopwords_en: - text = " 
".join(words) - if lemmatize and remove_stopwords_en: - text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) - elif not lemmatize and remove_stopwords_en: - text = " ".join(w for w in words if w not in stop_words_en) - elif lemmatize and not remove_stopwords_en: - text = " ".join(lemmatizer.lemmatize(w) for w in words) + if remove_stopwords_en: + words = filter(lambda w: w not in stop_words_en, words) - if strip: - text = text.strip() - - return text + return filter(lambda w: w != "", words) diff --git a/setup.py b/setup.py index 7f1cf17..534d8a3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.50", + version="1.51", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 54546bd..e10e1bd 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -13,7 +13,7 @@ class TestText(TestCase): ) expected = "" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_1(self): text = "
Hello, world
" @@ -23,7 +23,7 @@ class TestText(TestCase): ) expected = "Hello, world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_2(self): text = "
Hello, world
" @@ -34,18 +34,7 @@ class TestText(TestCase): ) expected = "hello, world" - self.assertEqual(cleaned, expected) - - def test_html_3(self): - text = "
\n Hello, \t\n world \n\t
" - cleaned = preprocess( - text, - clean_html=True, - lowercase=True, - ) - expected = " hello, world " - - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_4(self): text = "
\n Hello, \t\n world \n\t
" @@ -53,11 +42,10 @@ class TestText(TestCase): text, clean_html=True, lowercase=True, - strip=True ) expected = "hello, world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_5(self): text = "
\n Hello, \t\n world \n\t
" @@ -65,12 +53,11 @@ class TestText(TestCase): text, clean_html=True, lowercase=True, - strip=True, remove_punctuation=True ) expected = "hello world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_6(self): text = "
\n Hello, \t\na the world \n\t
" @@ -79,12 +66,11 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, remove_stopwords_en=True ) expected = "hello world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_7(self): text = "
\n Hello, \t\na the worlds \n\t
" @@ -93,13 +79,12 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, remove_stopwords_en=True, lemmatize=True ) expected = "hello world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_8(self): text = "
\n Hello, \t\na the worlds! \n\t
" @@ -108,13 +93,12 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, remove_stopwords_en=True, lemmatize=True ) expected = "hello world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_9(self): text = "
\n Hello, \t\nworld! it's it`s \n\t
" @@ -123,13 +107,12 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, lemmatize=True, fix_single_quotes=True ) expected = "hello world it's it's" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_single_quote(self): text = "it's it`s it’s" @@ -140,7 +123,7 @@ class TestText(TestCase): ) expected = "it's it's it's" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_10(self): text = "
\n Hello, \t\nworld! it's it`s https://google.ca/test/abc.pdf \n\t
" @@ -149,14 +132,13 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, lemmatize=True, fix_single_quotes=True, remove_urls=True ) expected = "hello world it's it's" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_11(self): text = "
\n Hello, \t\nworld! it's it`s & | \n\t
" @@ -165,7 +147,6 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, lemmatize=True, fix_single_quotes=True, remove_stopwords_en=True, @@ -173,7 +154,7 @@ class TestText(TestCase): ) expected = "hello world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_no_root(self): text = ">>217709510
Is there aservant that is against civilization and humanity?
Literally instant summon." @@ -183,7 +164,6 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, lemmatize=False, fix_single_quotes=True, remove_stopwords_en=False, @@ -191,7 +171,7 @@ class TestText(TestCase): ) expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_entity(self): text = "doesn't" @@ -201,7 +181,6 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, lemmatize=False, fix_single_quotes=True, remove_stopwords_en=False, @@ -209,7 +188,7 @@ class TestText(TestCase): ) expected = "doesn't" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_html_invalid_attribute(self): text = '' @@ -219,7 +198,6 @@ class TestText(TestCase): clean_html=True, lowercase=True, remove_punctuation=True, - strip=True, lemmatize=False, fix_single_quotes=True, remove_stopwords_en=False, @@ -228,7 +206,7 @@ class TestText(TestCase): expected = "" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_bigrams(self): text = "x A b c d e f g h" @@ -243,7 +221,7 @@ class TestText(TestCase): ) expected = "x a_b c_d e f_g h" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_trigrams(self): text = "x A b c d e f g h" @@ -257,7 +235,7 @@ class TestText(TestCase): ) expected = "x a_b_c d e_f_g h" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected) def test_remove_numbers(self): text = "Hello1 test1124test 12 1 1111111 world" @@ -268,4 +246,4 @@ class TestText(TestCase): ) expected = "hello1 test1124test world" - self.assertEqual(cleaned, expected) + self.assertEqual(" ".join(cleaned), expected)