mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-10 06:06:41 +00:00
Add strip_dashes option in preprocess()
This commit is contained in:
parent
c9fac7151a
commit
4befc3973d
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import chain, repeat
|
from itertools import chain, repeat
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
@ -53,6 +54,8 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l
|
|||||||
DASHES = ("–", "⸺", "–", "—")
|
DASHES = ("–", "⸺", "–", "—")
|
||||||
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
|
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
|
||||||
|
|
||||||
|
DASHES_RE = re.compile(r"-+")
|
||||||
|
|
||||||
SPECIAL_PUNCTUATION = ";:\"/()|*=>"
|
SPECIAL_PUNCTUATION = ";:\"/()|*=>"
|
||||||
SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION))
|
SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION))
|
||||||
|
|
||||||
@ -62,6 +65,7 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
|||||||
|
|
||||||
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
|
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
|
||||||
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
||||||
|
strip_dashes=False,
|
||||||
remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
|
remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
|
||||||
if lowercase:
|
if lowercase:
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
@ -71,6 +75,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
|||||||
|
|
||||||
text = text.translate(DASHES_TRANS)
|
text = text.translate(DASHES_TRANS)
|
||||||
|
|
||||||
|
if strip_dashes:
|
||||||
|
text = DASHES_RE.sub("-", text)
|
||||||
|
|
||||||
if remove_urls:
|
if remove_urls:
|
||||||
text = LINK_RE.sub(" ", text)
|
text = LINK_RE.sub(" ", text)
|
||||||
|
|
||||||
@ -96,6 +103,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
|||||||
if strip_quotes:
|
if strip_quotes:
|
||||||
words = map(lambda w: w.strip("\"'“”"), words)
|
words = map(lambda w: w.strip("\"'“”"), words)
|
||||||
|
|
||||||
|
if strip_dashes:
|
||||||
|
words = map(lambda w: w.strip("-"), words)
|
||||||
|
|
||||||
if bigrams:
|
if bigrams:
|
||||||
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)
|
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="hexlib",
|
name="hexlib",
|
||||||
version="1.76",
|
version="1.77",
|
||||||
description="Misc utility methods",
|
description="Misc utility methods",
|
||||||
author="simon987",
|
author="simon987",
|
||||||
author_email="me@simon987.net",
|
author_email="me@simon987.net",
|
||||||
|
@ -152,7 +152,7 @@ class TestText(TestCase):
|
|||||||
remove_stopwords_en=True,
|
remove_stopwords_en=True,
|
||||||
remove_urls=True
|
remove_urls=True
|
||||||
)
|
)
|
||||||
expected = "hello world"
|
expected = "hello world |"
|
||||||
|
|
||||||
self.assertEqual(" ".join(cleaned), expected)
|
self.assertEqual(" ".join(cleaned), expected)
|
||||||
|
|
||||||
@ -170,7 +170,7 @@ class TestText(TestCase):
|
|||||||
remove_urls=False
|
remove_urls=False
|
||||||
)
|
)
|
||||||
|
|
||||||
expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
|
expected = ">>217709510 is there a servant that is against civilization and humanity literally instant summon"
|
||||||
self.assertEqual(" ".join(cleaned), expected)
|
self.assertEqual(" ".join(cleaned), expected)
|
||||||
|
|
||||||
def test_html_entity(self):
|
def test_html_entity(self):
|
||||||
@ -257,3 +257,13 @@ class TestText(TestCase):
|
|||||||
expected = "hi test hello"
|
expected = "hi test hello"
|
||||||
|
|
||||||
self.assertEqual(" ".join(cleaned), expected)
|
self.assertEqual(" ".join(cleaned), expected)
|
||||||
|
|
||||||
|
def test_strip_dashes(self):
|
||||||
|
text = "yes -But something-something -- hello aa--bb"
|
||||||
|
cleaned = preprocess(
|
||||||
|
text,
|
||||||
|
strip_dashes=True
|
||||||
|
)
|
||||||
|
expected = "yes But something-something hello aa-bb"
|
||||||
|
|
||||||
|
self.assertEqual(" ".join(cleaned), expected)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user