mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-17 09:06:42 +00:00
Split punctuation into punctuation and special_punctuation
This commit is contained in:
parent
084acbe184
commit
c9fac7151a
@ -53,13 +53,16 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l
|
|||||||
DASHES = ("–", "⸺", "–", "—")
|
DASHES = ("–", "⸺", "–", "—")
|
||||||
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
|
DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
|
||||||
|
|
||||||
PUNCTUATION = ".,;:\"!?/()|*=>"
|
SPECIAL_PUNCTUATION = ";:\"/()|*=>"
|
||||||
|
SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION))
|
||||||
|
|
||||||
|
PUNCTUATION = ".,!?"
|
||||||
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
|
||||||
|
|
||||||
|
|
||||||
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
|
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
|
||||||
lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
|
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
||||||
trigrams: set = None, remove_numbers=False):
|
remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
|
||||||
if lowercase:
|
if lowercase:
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
@ -85,6 +88,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
|
|||||||
if remove_punctuation:
|
if remove_punctuation:
|
||||||
text = text.translate(PUNCTUATION_TRANS)
|
text = text.translate(PUNCTUATION_TRANS)
|
||||||
|
|
||||||
|
if remove_special_punctuation:
|
||||||
|
text = text.translate(SPECIAL_PUNCTUATION_TRANS)
|
||||||
|
|
||||||
words = text.split()
|
words = text.split()
|
||||||
|
|
||||||
if strip_quotes:
|
if strip_quotes:
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="hexlib",
|
name="hexlib",
|
||||||
version="1.75",
|
version="1.76",
|
||||||
description="Misc utility methods",
|
description="Misc utility methods",
|
||||||
author="simon987",
|
author="simon987",
|
||||||
author_email="me@simon987.net",
|
author_email="me@simon987.net",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user