From 765f6f59b7164cb80a6f21157530c613784411fc Mon Sep 17 00:00:00 2001
From: simon987 <me@simon987.net>
Date: Sun, 18 Apr 2021 12:12:31 -0400
Subject: [PATCH] Add text cleaning function

---
 hexlib/regex.py   |   2 +
 hexlib/text.py    |  56 ++++++++++++++++++++++++
 setup.py          |   4 +-
 test/test_text.py | 105 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 165 insertions(+), 2 deletions(-)
 create mode 100644 hexlib/text.py
 create mode 100644 test/test_text.py

diff --git a/hexlib/regex.py b/hexlib/regex.py
index 8c3268e..348e023 100644
--- a/hexlib/regex.py
+++ b/hexlib/regex.py
@@ -2,3 +2,5 @@ import re
 
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+WHITESPACE_RE = re.compile(r"\s+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"']+")
diff --git a/hexlib/text.py b/hexlib/text.py
new file mode 100644
index 0000000..9a8c219
--- /dev/null
+++ b/hexlib/text.py
@@ -0,0 +1,56 @@
+import nltk.corpus
+from hexlib.misc import silent_stdout
+from lxml import etree
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+
+from .regex import WHITESPACE_RE, PUNCTUATION_RE
+
+get_text = etree.XPath("//text()")
+
+# Corpora must be downloaded before stopwords.words() below can succeed
+with silent_stdout:
+    nltk.download("stopwords")
+    nltk.download("wordnet")
+
+stop_words_en = set(stopwords.words("english"))
+
+lemmatizer = WordNetLemmatizer()
+
+
+def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
+          remove_stopwords_en=False, lemmatize=False):
+    """Apply the selected normalization steps to *text* and return the result."""
+    if compress_whitespace and remove_stopwords_en:
+        raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace")
+
+    if clean_html:
+        try:
+            root = etree.fromstring(text)
+            text = "".join(get_text(root))
+        except etree.LxmlError:  # not parseable markup — leave text as-is
+            pass
+
+    if lowercase:
+        text = text.lower()
+
+    if compress_whitespace:
+        text = WHITESPACE_RE.sub(" ", text)
+
+    if strip:
+        text = text.strip()
+
+    if remove_punctuation:
+        text = PUNCTUATION_RE.sub("", text)
+
+    if remove_stopwords_en or lemmatize:
+        words = WHITESPACE_RE.split(text)
+
+        if lemmatize and remove_stopwords_en:
+            text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
+        elif not lemmatize and remove_stopwords_en:
+            text = " ".join(w for w in words if w not in stop_words_en)
+        elif lemmatize and not remove_stopwords_en:
+            text = " ".join(lemmatizer.lemmatize(w) for w in words)
+
+    return text
diff --git a/setup.py b/setup.py
index 5d65452..371780d 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.38",
+    version="1.39",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
@@ -13,6 +13,6 @@ setup(
     ]},
     install_requires=[
         "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
-        "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4"
+        "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4", "lxml", "nltk"
     ]
 )
diff --git a/test/test_text.py b/test/test_text.py
new file mode 100644
index 0000000..abd1ef6
--- /dev/null
+++ b/test/test_text.py
@@ -0,0 +1,105 @@
+from unittest import TestCase
+
+from hexlib.text import clean
+
+
+class TestText(TestCase):
+
+    def test_html_invalid(self):
+        text = ""
+        cleaned = clean(
+            text,
+            clean_html=True,
+        )
+        expected = ""
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_1(self):
+        text = "<div>Hello, world</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+        )
+        expected = "Hello, world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_2(self):
+        text = "<div>Hello, world</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True
+        )
+        expected = "hello, world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_3(self):
+        text = "<div>\n Hello, \t\n world \n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            compress_whitespace=True
+        )
+        expected = " hello, world "
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_4(self):
+        text = "<div>\n Hello, \t\n world \n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            compress_whitespace=True,
+            strip=True
+        )
+        expected = "hello, world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_5(self):
+        text = "<div>\n Hello, \t\n world \n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            compress_whitespace=True,
+            strip=True,
+            remove_punctuation=True
+        )
+        expected = "hello world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_6(self):
+        text = "<div>\n Hello, \t\na the world \n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            remove_stopwords_en=True
+        )
+        expected = "hello world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_7(self):
+        text = "<div>\n Hello, \t\na the worlds \n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            remove_stopwords_en=True,
+            lemmatize=True
+        )
+        expected = "hello world"
+
+        self.assertEqual(cleaned, expected)