From 7349c9a5f134877b3df26c8bcbf4b7e6a56a68d1 Mon Sep 17 00:00:00 2001
From: simon987
Date: Sun, 19 Sep 2021 10:57:07 -0400
Subject: [PATCH] Quick optimisation

---
Review notes (this section is not part of the commit message):

* hexlib/text.py: PUNCTUATION_TRANS passed the int len(PUNCTUATION) as the
  second argument of str.maketrans(). The two-argument form requires two
  strings of equal length, so the module failed with TypeError on import.
  The table is now built from " " * len(PUNCTUATION), i.e. every
  punctuation character maps to a single space.
* bench/text.py: both translation tables mapped 14 source characters onto a
  one-character replacement, which maketrans() rejects with ValueError. The
  replacement is now written as `" " * 14` so its length is explicit.
* The patch was re-flowed from a whitespace-collapsed copy. Indentation and
  blank lines were reconstructed to agree with the hunk headers and the
  diffstat below; whitespace runs inside the removed `stmt=` literal could
  not be recovered and are kept as they appeared.

 bench/text.py  | 25 +++++++++++++++++++++----
 hexlib/text.py |  7 +++++--
 setup.py       |  2 +-
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/bench/text.py b/bench/text.py
index a9ce01c..cce82c0 100644
--- a/bench/text.py
+++ b/bench/text.py
@@ -1,10 +1,27 @@
 from timeit import timeit
 
 
-if __name__ == '__main__':
+t = bytes.maketrans(b".,;:\"!?/()|*=>", b" " * 14)
+
+def translate(x: str):
+    arr = x.encode("utf8")
+
+    return arr.translate(t).decode("utf8")
+
+
+if __name__ == '__main__':
     res = timeit(
-        setup="from hexlib.text import preprocess",
-        stmt='text = "x A b c d e f g h"\ncleaned = preprocess(\n text,\n lowercase=True,\n trigrams={\n ("a", "b", "c"),\n ("e", "f", "g"),\n }\n)'
+        setup='t = str.maketrans(".,;:\\"!?/()|*=>", " " * 14)',
+        stmt='x = "Hello, world %123 & *".translate(t)'
     )
 
-    print(res)
\ No newline at end of file
+    # 0.865953s
+    print("translate = %fs" % res)
+
+    res = timeit(
+        setup='from text import translate',
+        stmt='x = translate("Hello, world %123 & *")'
+    )
+
+    # 0.865953s
+    print("custom = %fs" % res)
diff --git a/hexlib/text.py b/hexlib/text.py
index 9fb3912..16d5b5f 100644
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -59,6 +59,9 @@ def _transform_trigram(ngram_seq, ngrams):
 SINGLE_QUOTES = ("’", "`")
 SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
 
+PUNCTUATION = ".,;:\"!?/()|*=>"
+PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
+
 
 def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
                lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
@@ -84,9 +87,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
             pass
 
     if remove_punctuation:
-        text = PUNCTUATION_RE.sub(" ", text)
+        text = text.translate(PUNCTUATION_TRANS)
 
-    words = WHITESPACE_RE.sub(" ", text).split(" ")
+    words = text.split()
 
     if strip_quotes:
         words = filter(lambda w: w.strip("\"'"), words)
diff --git a/setup.py b/setup.py
index 534d8a3..6465cb2 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.51",
+    version="1.52",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",