Compare commits

..

No commits in common. "88f3124f85050a03bce7f53025c5b775fe8f94e5" and "32119535aee2efe7c86f09667c11ab8638379ce9" have entirely different histories.

4 changed files with 6 additions and 37 deletions

View File

@ -45,7 +45,6 @@ def get_web(session=None):
logger=stdout_logger,
cookie_file=os.environ.get("COOKIE_FILE", None),
retry_codes=set(int(x) if x else None for x in os.environ.get("RETRY_CODES", "").split(",")),
retries=int(os.environ.get("RETRIES", 3)),
retry_sleep=int(os.environ.get("RETRY_SLEEP", 0)),
ua=ua[os.environ.get("USER_AGENT")] if os.environ.get("USER_AGENT", None) is not None else None
)

View File

@ -32,22 +32,9 @@ def clean_multicore(texts, processes, **kwargs):
)
def _transform_bigram(ngram_seq, ngrams):
for ngram in ngram_seq:
if ngram in ngrams:
yield "_".join(ngram)
ngram_seq.__next__()
else:
yield ngram[0]
def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
remove_urls=False, bigrams: set = None):
if lowercase:
text = text.lower()
remove_urls=False):
if fix_single_quotes:
text = text.replace("`", "'")
@ -64,6 +51,9 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
if remove_punctuation:
text = PUNCTUATION_RE.sub(" ", text)
if lowercase:
text = text.lower()
if not remove_stopwords_en or not lemmatize or not strip_quotes:
text = WHITESPACE_RE.sub(" ", text)
@ -71,11 +61,6 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
words = WHITESPACE_RE.split(text)
text = " ".join(w.strip("\"'") for w in words)
if bigrams:
words = WHITESPACE_RE.split(text)
words.append("*")
text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
if remove_stopwords_en or lemmatize:
words = WHITESPACE_RE.split(text)

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.41",
version="1.40",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",

View File

@ -148,7 +148,7 @@ class TestText(TestCase):
self.assertEqual(cleaned, expected)
def test_html_11(self):
text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
cleaned = clean(
text,
clean_html=True,
@ -163,18 +163,3 @@ class TestText(TestCase):
expected = "hello world"
self.assertEqual(cleaned, expected)
def test_bigrams(self):
    # With lowercase=True the input is lowercased, each listed bigram is
    # expected to be joined with "_", and every other word passes through.
    known_bigrams = {("a", "b"), ("c", "d"), ("f", "g")}
    result = clean("x A b c d e f g h", lowercase=True, bigrams=known_bigrams)
    self.assertEqual(result, "x a_b c_d e f_g h")