mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-04 02:12:59 +00:00
Update fix_single_quotes
This commit is contained in:
parent
60273fb6bd
commit
7e0ffafb8c
@ -3,5 +3,5 @@ import re
|
||||
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
|
||||
WHITESPACE_RE = re.compile(r"\s+")
|
||||
PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+")
|
||||
PUNCTUATION_RE = re.compile(r"[.,;:\"“!?/()|*=>]+")
|
||||
XML_ENTITY_RE = re.compile(r"&[a-z]+;")
|
||||
|
@ -51,6 +51,7 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
|
||||
|
||||
if fix_single_quotes:
|
||||
text = text.replace("`", "'")
|
||||
text = text.replace("’", "'")
|
||||
|
||||
if remove_urls:
|
||||
text = LINK_RE.sub(" ", text)
|
||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="hexlib",
|
||||
version="1.48",
|
||||
version="1.49",
|
||||
description="Misc utility methods",
|
||||
author="simon987",
|
||||
author_email="me@simon987.net",
|
||||
|
@ -130,6 +130,17 @@ class TestText(TestCase):
|
||||
expected = "hello world it's it's"
|
||||
|
||||
self.assertEqual(cleaned, expected)
|
||||
|
||||
def test_single_quote(self):
|
||||
text = "it's it`s it’s"
|
||||
cleaned = preprocess(
|
||||
text,
|
||||
lowercase=True,
|
||||
fix_single_quotes=True
|
||||
)
|
||||
expected = "it's it's it's"
|
||||
|
||||
self.assertEqual(cleaned, expected)
|
||||
|
||||
def test_html_10(self):
|
||||
text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
|
||||
|
Loading…
x
Reference in New Issue
Block a user