mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-10 06:06:41 +00:00
improve text cleaning
This commit is contained in:
parent
45b5803c40
commit
00323ea576
@@ -3,4 +3,4 @@ import re
|
||||
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
|
||||
WHITESPACE_RE = re.compile(r"\s+")
|
||||
PUNCTUATION_RE = re.compile(r"[.,;:\"!?]+")
|
||||
PUNCTUATION_RE = re.compile(r"[.,;:\"!?/]+")
|
||||
|
@@ -50,9 +50,6 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
|
||||
if compress_whitespace:
|
||||
text = WHITESPACE_RE.sub(" ", text)
|
||||
|
||||
if strip:
|
||||
text = text.strip()
|
||||
|
||||
if remove_stopwords_en or lemmatize:
|
||||
words = WHITESPACE_RE.split(text)
|
||||
|
||||
@@ -63,4 +60,7 @@ def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, st
|
||||
elif lemmatize and not remove_stopwords_en:
|
||||
text = " ".join(lemmatizer.lemmatize(w) for w in words)
|
||||
|
||||
if strip:
|
||||
text = text.strip()
|
||||
|
||||
return text
|
||||
|
Loading…
x
Reference in New Issue
Block a user