Update fix_single_quotes

This commit is contained in:
simon987 2021-08-28 20:48:00 -04:00
parent 60273fb6bd
commit 7e0ffafb8c
4 changed files with 14 additions and 2 deletions

View File

@ -3,5 +3,5 @@ import re
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
WHITESPACE_RE = re.compile(r"\s+")
PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+")
PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+")
XML_ENTITY_RE = re.compile(r"&[a-z]+;")

View File

@ -51,6 +51,7 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc
if fix_single_quotes:
text = text.replace("`", "'")
text = text.replace("’", "'")
if remove_urls:
text = LINK_RE.sub(" ", text)

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.48",
version="1.49",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",

View File

@ -130,6 +130,17 @@ class TestText(TestCase):
expected = "hello world it's it's"
self.assertEqual(cleaned, expected)
def test_single_quote(self):
    # Exercise fix_single_quotes with three apostrophe variants: a plain
    # ASCII apostrophe, a backtick, and a typographic right single quote.
    # NOTE(review): the third token's quote character was missing from this
    # view; U+2019 is presumed because the expected output requires an
    # apostrophe there -- confirm against the repository.
    # The escape keeps the literal intact through encoding-lossy tooling.
    text = "it's it`s it\u2019s"
    cleaned = preprocess(
        text,
        lowercase=True,
        fix_single_quotes=True
    )
    # Every variant should collapse to the plain ASCII apostrophe.
    expected = "it's it's it's"
    self.assertEqual(cleaned, expected)
def test_html_10(self):
text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"