mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-04 02:12:59 +00:00
Quick optimisation
This commit is contained in:
parent
d19442b00e
commit
7349c9a5f1
@ -1,10 +1,27 @@
|
|||||||
from timeit import timeit
|
from timeit import timeit
|
||||||
|
|
||||||
if __name__ == '__main__':
|
# Byte-level translation table that turns each punctuation character into a
# space. bytes.maketrans requires both arguments to have the same length, so
# the replacement is built from the length of the punctuation set.
_PUNCTUATION_BYTES = b".,;:\"!?/()|*=>"
t = bytes.maketrans(_PUNCTUATION_BYTES, b" " * len(_PUNCTUATION_BYTES))
|
||||||
|
|
||||||
|
|
||||||
|
def translate(x: str):
|
||||||
|
arr = x.encode("utf8")
|
||||||
|
|
||||||
|
return arr.translate(t).decode("utf8")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
res = timeit(
|
res = timeit(
|
||||||
setup="from hexlib.text import preprocess",
|
# str.maketrans needs equal-length arguments: one space per punctuation char (14).
setup='t = str.maketrans(".,;:\\"!?/()|*=>", " " * 14)',
|
||||||
stmt='text = "x A b c d e f g h"\ncleaned = preprocess(\n text,\n lowercase=True,\n trigrams={\n ("a", "b", "c"),\n ("e", "f", "g"),\n }\n)'
|
stmt='x = "Hello, world %123 & *".translate(t)'
|
||||||
)
|
)
|
||||||
|
|
||||||
print(res)
|
# 0.865953s
|
||||||
|
print("translate = %fs" % res)
|
||||||
|
|
||||||
|
res = timeit(
|
||||||
|
setup='from text import translate',
|
||||||
|
stmt='x = translate("Hello, world %123 & *")'
|
||||||
|
)
|
||||||
|
|
||||||
|
# 0.865953s
|
||||||
|
print("custom = %fs" % res)
|
||||||
|
@ -59,6 +59,9 @@ def _transform_trigram(ngram_seq, ngrams):
|
|||||||
SINGLE_QUOTES = ("’", "`")
|
SINGLE_QUOTES = ("’", "`")
|
||||||
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
|
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
|
||||||
|
|
||||||
|
PUNCTUATION = ".,;:\"!?/()|*=>"
|
||||||
|
# Translation table mapping every punctuation character to a space.
# The two-argument form of str.maketrans requires two strings of equal
# length, so the replacement is one space per punctuation character.
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
||||||
|
|
||||||
|
|
||||||
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
|
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
|
||||||
lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
|
lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
|
||||||
@ -84,9 +87,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
if remove_punctuation:
|
if remove_punctuation:
|
||||||
text = PUNCTUATION_RE.sub(" ", text)
|
text = text.translate(PUNCTUATION_TRANS)
|
||||||
|
|
||||||
words = WHITESPACE_RE.sub(" ", text).split(" ")
|
words = text.split()
|
||||||
|
|
||||||
if strip_quotes:
|
if strip_quotes:
|
||||||
words = filter(lambda w: w.strip("\"'"), words)
|
words = filter(lambda w: w.strip("\"'"), words)
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="hexlib",
|
name="hexlib",
|
||||||
version="1.51",
|
version="1.52",
|
||||||
description="Misc utility methods",
|
description="Misc utility methods",
|
||||||
author="simon987",
|
author="simon987",
|
||||||
author_email="me@simon987.net",
|
author_email="me@simon987.net",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user