Quick optimisation

This commit is contained in:
simon987 2021-09-19 10:57:07 -04:00
parent d19442b00e
commit 7349c9a5f1
3 changed files with 27 additions and 7 deletions

View File

@ -1,10 +1,27 @@
from timeit import timeit
if __name__ == '__main__':
# Punctuation characters (all ASCII) that translate() replaces with a space.
_PUNCTUATION_BYTES = b".,;:\"!?/()|*=>"

# Byte-level translation table. bytes.maketrans() requires both arguments to
# have the same length, so the replacement is sized from the source set
# instead of being written out by hand.
t = bytes.maketrans(_PUNCTUATION_BYTES, b" " * len(_PUNCTUATION_BYTES))


def translate(x: str) -> str:
    """Return *x* with every punctuation character replaced by a space.

    The string is encoded to UTF-8, translated byte-by-byte with the
    module-level table ``t`` and decoded again. This is safe for non-ASCII
    text because bytes of multi-byte UTF-8 sequences are never in the ASCII
    range, so only real punctuation characters are touched.

    :param x: text to clean
    :return: a string of the same length with punctuation replaced by spaces
    """
    arr = x.encode("utf8")
    return arr.translate(t).decode("utf8")
# Benchmark entry point: runs two timeit() measurements and prints each result
# with a label ("translate" and "custom").
# NOTE(review): the first timeit() call below contains two setup=/stmt= pairs
# with no comma between them, and both print(res) and a labelled print follow
# it. This looks like the removed and added lines of a diff shown together --
# confirm which pair belongs to the current file before running.
if __name__ == '__main__':
res = timeit(
setup="from hexlib.text import preprocess",
stmt='text = "x A b c d e f g h"\ncleaned = preprocess(\n text,\n lowercase=True,\n trigrams={\n ("a", "b", "c"),\n ("e", "f", "g"),\n }\n)'
# NOTE(review): the setup code below passes a 14-character string and a
# 1-character string to str.maketrans(), which requires equal lengths.
# Presumably a run of spaces was collapsed -- confirm.
setup='t = str.maketrans(".,;:\\"!?/()|*=>", " ")',
stmt='x = "Hello, world %123 & *".translate(t)'
)
print(res)
# 0.865953s
print("translate = %fs" % res)
# Second measurement: imports translate from a module named "text" and times
# a call on the same sample string.
res = timeit(
setup='from text import translate',
stmt='x = translate("Hello, world %123 & *")'
)
# 0.865953s
print("custom = %fs" % res)

View File

@ -59,6 +59,9 @@ def _transform_trigram(ngram_seq, ngrams):
# Quote-like characters that are normalised to a plain ASCII apostrophe.
# NOTE(review): the first entry was an empty string in the text under review,
# which makes str.maketrans() raise because its two string arguments must have
# equal length. It is presumably the typographic apostrophe U+2019 lost in
# extraction -- confirm against the upstream file.
SINGLE_QUOTES = ("\u2019", "`")
# One apostrophe per source character keeps both maketrans() arguments the
# same length whenever SINGLE_QUOTES holds single-character entries.
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "'" * len(SINGLE_QUOTES))
# Characters replaced by a space when punctuation removal is requested.
PUNCTUATION = ".,;:\"!?/()|*=>"
# Translation table mapping each punctuation character to a single space.
# The two-argument form of str.maketrans() needs two strings of equal length,
# so the replacement is a run of spaces as long as PUNCTUATION (passing the
# bare length as an int raises TypeError).
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
@ -84,9 +87,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
pass
if remove_punctuation:
text = PUNCTUATION_RE.sub(" ", text)
text = text.translate(PUNCTUATION_TRANS)
words = WHITESPACE_RE.sub(" ", text).split(" ")
words = text.split()
if strip_quotes:
words = filter(lambda w: w.strip("\"'"), words)

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.51",
version="1.52",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",