mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-21 18:46:42 +00:00
Compare commits
5 Commits
8d8f9e8751
...
90d434ec73
Author | SHA1 | Date | |
---|---|---|---|
90d434ec73 | |||
55fd4a66d2 | |||
3677815d57 | |||
1ce795a759 | |||
e1537297d7 |
@ -56,9 +56,12 @@ def _transform_trigram(ngram_seq, ngrams):
|
|||||||
yield ngram[0]
|
yield ngram[0]
|
||||||
|
|
||||||
|
|
||||||
SINGLE_QUOTES = ("’", "`")
|
SINGLE_QUOTES = ("’", "`", "‘")
|
||||||
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
|
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
|
||||||
|
|
||||||
|
DASHES = ("–", "⸺", "–", "—")
|
||||||
|
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
|
||||||
|
|
||||||
PUNCTUATION = ".,;:\"!?/()|*=>"
|
PUNCTUATION = ".,;:\"!?/()|*=>"
|
||||||
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
||||||
|
|
||||||
@ -72,6 +75,8 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
|||||||
if fix_single_quotes:
|
if fix_single_quotes:
|
||||||
text = text.translate(SINGLE_QUOTE_TRANS)
|
text = text.translate(SINGLE_QUOTE_TRANS)
|
||||||
|
|
||||||
|
text = text.translate(DASHES_TRANS)
|
||||||
|
|
||||||
if remove_urls:
|
if remove_urls:
|
||||||
text = LINK_RE.sub(" ", text)
|
text = LINK_RE.sub(" ", text)
|
||||||
|
|
||||||
@ -92,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
|||||||
words = text.split()
|
words = text.split()
|
||||||
|
|
||||||
if strip_quotes:
|
if strip_quotes:
|
||||||
words = filter(lambda w: w.strip("\"'"), words)
|
words = map(lambda w: w.strip("\"'“”"), words)
|
||||||
|
|
||||||
if bigrams:
|
if bigrams:
|
||||||
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)
|
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="hexlib",
|
name="hexlib",
|
||||||
version="1.66",
|
version="1.70",
|
||||||
description="Misc utility methods",
|
description="Misc utility methods",
|
||||||
author="simon987",
|
author="simon987",
|
||||||
author_email="me@simon987.net",
|
author_email="me@simon987.net",
|
||||||
|
@ -247,3 +247,13 @@ class TestText(TestCase):
|
|||||||
expected = "hello1 test1124test world"
|
expected = "hello1 test1124test world"
|
||||||
|
|
||||||
self.assertEqual(" ".join(cleaned), expected)
|
self.assertEqual(" ".join(cleaned), expected)
|
||||||
|
|
||||||
|
def test_strip_quotes(self):
|
||||||
|
text = "'hi' “test” 'hello\""
|
||||||
|
cleaned = preprocess(
|
||||||
|
text,
|
||||||
|
strip_quotes=True
|
||||||
|
)
|
||||||
|
expected = "hi test hello"
|
||||||
|
|
||||||
|
self.assertEqual(" ".join(cleaned), expected)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user