mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-03 09:53:00 +00:00
Quick optimisation
This commit is contained in:
parent
d19442b00e
commit
7349c9a5f1
@ -1,10 +1,27 @@
|
||||
from timeit import timeit
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Punctuation bytes that translate() blanks out; every one of them is ASCII.
_PUNCTUATION_BYTES = b".,;:\"!?/()|*=>"

# bytes.maketrans() requires both arguments to have the same length, so the
# replacement is built as one space per punctuation byte.
t = bytes.maketrans(_PUNCTUATION_BYTES, b" " * len(_PUNCTUATION_BYTES))


def translate(x: str) -> str:
    """Return *x* with each punctuation character replaced by a space.

    The text is encoded to UTF-8, mapped through the byte table ``t`` and
    decoded again. Only ASCII bytes are remapped, and UTF-8 never uses ASCII
    byte values inside a multi-byte sequence, so non-ASCII characters pass
    through unchanged.
    """
    arr = x.encode("utf8")
    return arr.translate(t).decode("utf8")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Each benchmark is its own timeit() call: a single call cannot take
    # `setup`/`stmt` more than once.

    # Benchmark 1: the full preprocess() pipeline with lowercase + trigrams.
    res = timeit(
        setup="from hexlib.text import preprocess",
        stmt='text = "x A b c d e f g h"\ncleaned = preprocess(\n text,\n lowercase=True,\n trigrams={\n ("a", "b", "c"),\n ("e", "f", "g"),\n }\n)',
    )
    print(res)
    # 0.865953s

    # Benchmark 2: plain str.translate(). str.maketrans() needs two strings of
    # equal length, so the replacement is one space per punctuation character.
    res = timeit(
        setup='p = ".,;:\\"!?/()|*=>"\nt = str.maketrans(p, " " * len(p))',
        stmt='x = "Hello, world %123 & *".translate(t)',
    )
    print("translate = %fs" % res)

    # Benchmark 3: the bytes-based translate() helper from this module.
    # NOTE(review): the setup import assumes this file is importable as
    # `text` -- confirm against the actual file name.
    res = timeit(
        setup='from text import translate',
        stmt='x = translate("Hello, world %123 & *")',
    )
    # 0.865953s
    print("custom = %fs" % res)
|
||||
|
@ -59,6 +59,9 @@ def _transform_trigram(ngram_seq, ngrams):
|
||||
SINGLE_QUOTES = ("’", "`")
|
||||
# Translation table mapping every character in SINGLE_QUOTES to a plain "'".
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "'" * len(SINGLE_QUOTES))
|
||||
|
||||
# Characters that are replaced by a space when punctuation is removed.
PUNCTUATION = ".,;:\"!?/()|*=>"
# str.maketrans(x, y) takes two strings of equal length; passing an int as
# the second argument raises TypeError. Map each punctuation character to a
# single space instead.
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
||||
|
||||
|
||||
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
|
||||
lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
|
||||
@ -84,9 +87,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
||||
pass
|
||||
|
||||
if remove_punctuation:
|
||||
text = PUNCTUATION_RE.sub(" ", text)
|
||||
text = text.translate(PUNCTUATION_TRANS)
|
||||
|
||||
words = WHITESPACE_RE.sub(" ", text).split(" ")
|
||||
words = text.split()
|
||||
|
||||
if strip_quotes:
|
||||
words = filter(lambda w: w.strip("\"'"), words)
|
||||
|
Loading…
x
Reference in New Issue
Block a user