Compare commits


No commits in common. "32119535aee2efe7c86f09667c11ab8638379ce9" and "18cd59fc4a981849799c86a7e22750bb177cad9a" have entirely different histories.

3 changed files with 16 additions and 101 deletions

View File

@@ -3,4 +3,4 @@ import re
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"']+")

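For reference, a minimal sketch of what the narrowed character class changes in practice (the sample string is hypothetical): the old pattern also stripped symbols such as !, ?, slashes, parentheses, |, *, and =, while the new one drops those but adds the apostrophe.

import re

OLD_PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=]+")  # pattern before this change
NEW_PUNCTUATION_RE = re.compile(r"[.,;:\"']+")         # pattern after this change

sample = "hello, world! (it's a=b)"
print(OLD_PUNCTUATION_RE.sub("", sample))  # hello world it's ab
print(NEW_PUNCTUATION_RE.sub("", sample))  # hello world! (its a=b)
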
View File

@@ -1,65 +1,43 @@
-from functools import partial
-from multiprocessing.pool import Pool
 import nltk.corpus
 from lxml import etree
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE
+from .regex import WHITESPACE_RE, PUNCTUATION_RE
 get_text = etree.XPath("//text()")
 stop_words_en = set(stopwords.words("english"))
-extra_stop_words_en = [
-    "u", "&", "-", "--"
-]
-stop_words_en.update(extra_stop_words_en)
 nltk.download("stopwords", quiet=True)
 nltk.download("wordnet", quiet=True)
 lemmatizer = WordNetLemmatizer()
-def clean_multicore(texts, processes, **kwargs):
-    pool = Pool(processes=processes)
-    return pool.map(
-        func=partial(clean, **kwargs),
-        iterable=texts,
-    )
-def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
-          remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
-          remove_urls=False):
-    if fix_single_quotes:
-        text = text.replace("`", "'")
-    if remove_urls:
-        text = LINK_RE.sub(" ", text)
+def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
+          remove_stopwords_en=False, lemmatize=False):
+    if compress_whitespace and remove_stopwords_en:
+        raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace")
     if clean_html:
         try:
-            root = etree.fromstring(text.replace("&", ""))
+            root = etree.fromstring(text)
             text = "".join(get_text(root))
         except:
             pass
-    if remove_punctuation:
-        text = PUNCTUATION_RE.sub(" ", text)
     if lowercase:
         text = text.lower()
-    if not remove_stopwords_en or not lemmatize or not strip_quotes:
+    if compress_whitespace:
         text = WHITESPACE_RE.sub(" ", text)
-    if strip_quotes:
-        words = WHITESPACE_RE.split(text)
-        text = " ".join(w.strip("\"'") for w in words)
+    if strip:
+        text = text.strip()
+    if remove_punctuation:
+        text = PUNCTUATION_RE.sub("", text)
     if remove_stopwords_en or lemmatize:
         words = WHITESPACE_RE.split(text)
@@ -71,7 +49,4 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
     elif lemmatize and not remove_stopwords_en:
         text = " ".join(lemmatizer.lemmatize(w) for w in words)
-    if strip:
-        text = text.strip()
     return text
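A minimal usage sketch of the reworked clean, with a hypothetical input string. The flags now apply in source order (clean_html, lowercase, compress_whitespace, strip, remove_punctuation, then stopwords/lemmatization), and the redundant-flag guard raises up front:

cleaned = clean(
    "<div>\n Hello, \t<strong>World </strong>\n</div>",  # hypothetical input
    clean_html=True,
    lowercase=True,
    compress_whitespace=True,
    strip=True,
)
# cleaned == "hello, world"

clean("hello", compress_whitespace=True, remove_stopwords_en=True)
# raises ValueError: Redundant flags: remove_stopwords implies compress_whitespace

Callers that depended on the removed clean_multicore helper can keep an equivalent at the call site; a sketch based on the deleted code, assuming clean is imported from this module:

from functools import partial
from multiprocessing.pool import Pool

def clean_multicore(texts, processes, **kwargs):
    # Same behavior as the deleted helper, with the pool closed afterwards.
    with Pool(processes=processes) as pool:
        return pool.map(partial(clean, **kwargs), texts)
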

View File

@@ -42,6 +42,7 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
+            compress_whitespace=True
         )
         expected = " hello, world "
@@ -53,6 +54,7 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
+            compress_whitespace=True,
             strip=True
         )
         expected = "hello, world"
@@ -65,6 +67,7 @@ class TestText(TestCase):
             text,
             clean_html=True,
             lowercase=True,
+            compress_whitespace=True,
             strip=True,
             remove_punctuation=True
         )
@@ -100,66 +103,3 @@ class TestText(TestCase):
         expected = "hello world"
         self.assertEqual(cleaned, expected)
-
-    def test_html_8(self):
-        text = "<div>\n Hello, \t\n<strong>a the worlds! </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            remove_punctuation=True,
-            strip=True,
-            remove_stopwords_en=True,
-            lemmatize=True
-        )
-        expected = "hello world"
-        self.assertEqual(cleaned, expected)
-
-    def test_html_9(self):
-        text = "<div>\n Hello, \t\n<strong>world! it's it`s </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            remove_punctuation=True,
-            strip=True,
-            lemmatize=True,
-            fix_single_quotes=True
-        )
-        expected = "hello world it's it's"
-        self.assertEqual(cleaned, expected)
-
-    def test_html_10(self):
-        text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            remove_punctuation=True,
-            strip=True,
-            lemmatize=True,
-            fix_single_quotes=True,
-            remove_urls=True
-        )
-        expected = "hello world it's it's"
-        self.assertEqual(cleaned, expected)
-
-    def test_html_11(self):
-        text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            remove_punctuation=True,
-            strip=True,
-            lemmatize=True,
-            fix_single_quotes=True,
-            remove_stopwords_en=True,
-            remove_urls=True
-        )
-        expected = "hello world"
-        self.assertEqual(cleaned, expected)
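The deleted tests exercised flags that no longer exist (fix_single_quotes, remove_urls, the extra stop words). The new ValueError guard is not covered here; a possible test along these lines, with the method name test_redundant_flags being hypothetical:

    def test_redundant_flags(self):
        text = "hello world"
        with self.assertRaises(ValueError):
            clean(text, compress_whitespace=True, remove_stopwords_en=True)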