Compare commits

..

5 Commits

Author SHA1 Message Date
90d434ec73 Add more single quotes 2021-11-16 15:32:24 -05:00
55fd4a66d2 Fix strip_quotes 2021-11-16 11:48:23 -05:00
3677815d57 Add more quotes in strip_quotes 2021-11-16 11:39:10 -05:00
1ce795a759 ... 2021-11-16 11:36:17 -05:00
e1537297d7 normalize dashes in preprocess 2021-11-16 11:34:48 -05:00
3 changed files with 18 additions and 3 deletions

View File

@@ -56,9 +56,12 @@ def _transform_trigram(ngram_seq, ngrams):
yield ngram[0] yield ngram[0]
SINGLE_QUOTES = ("’", "`") SINGLE_QUOTES = ("’", "`", "‘")
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES)))) SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
DASHES = ("", "", "", "")
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
PUNCTUATION = ".,;:\"!?/()|*=>" PUNCTUATION = ".,;:\"!?/()|*=>"
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
@@ -72,6 +75,8 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
if fix_single_quotes: if fix_single_quotes:
text = text.translate(SINGLE_QUOTE_TRANS) text = text.translate(SINGLE_QUOTE_TRANS)
text = text.translate(DASHES_TRANS)
if remove_urls: if remove_urls:
text = LINK_RE.sub(" ", text) text = LINK_RE.sub(" ", text)
@@ -92,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
words = text.split() words = text.split()
if strip_quotes: if strip_quotes:
words = filter(lambda w: w.strip("\"'"), words) words = map(lambda w: w.strip("\"'“”"), words)
if bigrams: if bigrams:
words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup( setup(
name="hexlib", name="hexlib",
version="1.66", version="1.70",
description="Misc utility methods", description="Misc utility methods",
author="simon987", author="simon987",
author_email="me@simon987.net", author_email="me@simon987.net",

View File

@@ -247,3 +247,13 @@ class TestText(TestCase):
expected = "hello1 test1124test world" expected = "hello1 test1124test world"
self.assertEqual(" ".join(cleaned), expected) self.assertEqual(" ".join(cleaned), expected)
def test_strip_quotes(self):
text = "'hi' “test” 'hello\""
cleaned = preprocess(
text,
strip_quotes=True
)
expected = "hi test hello"
self.assertEqual(" ".join(cleaned), expected)