mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-20 18:26:43 +00:00
Compare commits
2 Commits
32119535ae
...
88f3124f85
Author | SHA1 | Date | |
---|---|---|---|
88f3124f85 | |||
8edad0255b |
@ -45,6 +45,7 @@ def get_web(session=None):
|
|||||||
logger=stdout_logger,
|
logger=stdout_logger,
|
||||||
cookie_file=os.environ.get("COOKIE_FILE", None),
|
cookie_file=os.environ.get("COOKIE_FILE", None),
|
||||||
retry_codes=set(int(x) if x else None for x in os.environ.get("RETRY_CODES", "").split(",")),
|
retry_codes=set(int(x) if x else None for x in os.environ.get("RETRY_CODES", "").split(",")),
|
||||||
|
retries=int(os.environ.get("RETRIES", 3)),
|
||||||
retry_sleep=int(os.environ.get("RETRY_SLEEP", 0)),
|
retry_sleep=int(os.environ.get("RETRY_SLEEP", 0)),
|
||||||
ua=ua[os.environ.get("USER_AGENT")] if os.environ.get("USER_AGENT", None) is not None else None
|
ua=ua[os.environ.get("USER_AGENT")] if os.environ.get("USER_AGENT", None) is not None else None
|
||||||
)
|
)
|
||||||
|
@ -32,9 +32,22 @@ def clean_multicore(texts, processes, **kwargs):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _transform_bigram(ngram_seq, ngrams):
|
||||||
|
for ngram in ngram_seq:
|
||||||
|
if ngram in ngrams:
|
||||||
|
yield "_".join(ngram)
|
||||||
|
|
||||||
|
ngram_seq.__next__()
|
||||||
|
else:
|
||||||
|
yield ngram[0]
|
||||||
|
|
||||||
|
|
||||||
def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
|
def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
|
||||||
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
||||||
remove_urls=False):
|
remove_urls=False, bigrams: set = None):
|
||||||
|
if lowercase:
|
||||||
|
text = text.lower()
|
||||||
|
|
||||||
if fix_single_quotes:
|
if fix_single_quotes:
|
||||||
text = text.replace("`", "'")
|
text = text.replace("`", "'")
|
||||||
|
|
||||||
@ -51,9 +64,6 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
|
|||||||
if remove_punctuation:
|
if remove_punctuation:
|
||||||
text = PUNCTUATION_RE.sub(" ", text)
|
text = PUNCTUATION_RE.sub(" ", text)
|
||||||
|
|
||||||
if lowercase:
|
|
||||||
text = text.lower()
|
|
||||||
|
|
||||||
if not remove_stopwords_en or not lemmatize or not strip_quotes:
|
if not remove_stopwords_en or not lemmatize or not strip_quotes:
|
||||||
text = WHITESPACE_RE.sub(" ", text)
|
text = WHITESPACE_RE.sub(" ", text)
|
||||||
|
|
||||||
@ -61,6 +71,11 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
|
|||||||
words = WHITESPACE_RE.split(text)
|
words = WHITESPACE_RE.split(text)
|
||||||
text = " ".join(w.strip("\"'") for w in words)
|
text = " ".join(w.strip("\"'") for w in words)
|
||||||
|
|
||||||
|
if bigrams:
|
||||||
|
words = WHITESPACE_RE.split(text)
|
||||||
|
words.append("*")
|
||||||
|
text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
|
||||||
|
|
||||||
if remove_stopwords_en or lemmatize:
|
if remove_stopwords_en or lemmatize:
|
||||||
words = WHITESPACE_RE.split(text)
|
words = WHITESPACE_RE.split(text)
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="hexlib",
|
name="hexlib",
|
||||||
version="1.40",
|
version="1.41",
|
||||||
description="Misc utility methods",
|
description="Misc utility methods",
|
||||||
author="simon987",
|
author="simon987",
|
||||||
author_email="me@simon987.net",
|
author_email="me@simon987.net",
|
||||||
|
@ -148,7 +148,7 @@ class TestText(TestCase):
|
|||||||
self.assertEqual(cleaned, expected)
|
self.assertEqual(cleaned, expected)
|
||||||
|
|
||||||
def test_html_11(self):
|
def test_html_11(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>world! it's it`s u & | </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = clean(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
@ -163,3 +163,18 @@ class TestText(TestCase):
|
|||||||
expected = "hello world"
|
expected = "hello world"
|
||||||
|
|
||||||
self.assertEqual(cleaned, expected)
|
self.assertEqual(cleaned, expected)
|
||||||
|
|
||||||
|
def test_bigrams(self):
    """clean() with a bigram set joins the listed adjacent words with "_"."""
    known_pairs = {("a", "b"), ("c", "d"), ("f", "g")}

    result = clean("x A b c d e f g h", lowercase=True, bigrams=known_pairs)

    self.assertEqual(result, "x a_b c_d e f_g h")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user