Compare commits

...

4 Commits

SHA1 Message Date
18cd59fc4a ignore log in text 2021-04-18 12:20:22 -04:00
d895ac837e ignore log in text 2021-04-18 12:18:27 -04:00
ae59522b27 Add customstderr 2021-04-18 12:17:00 -04:00
765f6f59b7 Add text cleaning function 2021-04-18 12:12:31 -04:00
5 changed files with 180 additions and 4 deletions

View File

@@ -1,9 +1,8 @@
+import atexit
 import os
 import sys
 import time
 from threading import Lock
-import atexit
 from time import sleep
 
 import siphash
@@ -23,7 +22,9 @@ def retry(attempts, callback=None, retry_sleep=0):
                     callback(e)
                 retries -= 1
                 sleep(retry_sleep)
+
         return wrapper
+
     return decorate
@@ -105,4 +106,20 @@ class CustomStdOut:
         self.fp.close()
 
 
+class CustomStdErr:
+    original_stderr = sys.stderr
+
+    def __init__(self, fname):
+        self.fname = fname
+
+    def __enter__(self):
+        self.fp = open(self.fname, "w")
+        sys.stderr = self.fp
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sys.stderr = CustomStdErr.original_stderr
+        self.fp.close()
+
+
 silent_stdout = CustomStdOut(os.devnull)
+silent_stderr = CustomStdErr(os.devnull)
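
CustomStdErr mirrors the existing CustomStdOut context manager: entering it redirects sys.stderr to the named file, and exiting restores the saved stream and closes the file. A minimal usage sketch, illustrative and not part of the diff (it assumes silent_stderr is imported from the module this hunk touches, whose path is not shown on this page):

    import sys

    with silent_stderr:  # reusable: __enter__ reopens os.devnull each time
        print("discarded", file=sys.stderr)
    print("visible again", file=sys.stderr)  # stream restored by __exit__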

View File

@@ -2,3 +2,5 @@ import re
 
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+WHITESPACE_RE = re.compile(r"\s+")
+PUNCTUATION_RE = re.compile(r"[.,;:\"']+")
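
The two new patterns feed the clean() function introduced in hexlib/text.py below. A quick sketch of what they match (illustrative, not part of the diff):

    WHITESPACE_RE.sub(" ", "a \t\n b")    # -> "a b"
    PUNCTUATION_RE.sub("", "it's done.")  # -> "its done"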

hexlib/text.py (new file, 52 lines)
View File

@@ -0,0 +1,52 @@
+import nltk.corpus
+from lxml import etree
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+
+from .regex import WHITESPACE_RE, PUNCTUATION_RE
+
+get_text = etree.XPath("//text()")
+
+nltk.download("stopwords", quiet=True)
+nltk.download("wordnet", quiet=True)
+
+stop_words_en = set(stopwords.words("english"))
+lemmatizer = WordNetLemmatizer()
+
+
+def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
+          remove_stopwords_en=False, lemmatize=False):
+    if compress_whitespace and remove_stopwords_en:
+        raise ValueError("Redundant flags: remove_stopwords_en implies compress_whitespace")
+
+    if clean_html:
+        try:
+            root = etree.fromstring(text)
+            text = "".join(get_text(root))
+        except Exception:
+            pass
+
+    if lowercase:
+        text = text.lower()
+
+    if compress_whitespace:
+        text = WHITESPACE_RE.sub(" ", text)
+
+    if strip:
+        text = text.strip()
+
+    if remove_punctuation:
+        text = PUNCTUATION_RE.sub("", text)
+
+    if remove_stopwords_en or lemmatize:
+        words = WHITESPACE_RE.split(text)
+        if lemmatize and remove_stopwords_en:
+            text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
+        elif remove_stopwords_en:
+            text = " ".join(w for w in words if w not in stop_words_en)
+        elif lemmatize:
+            text = " ".join(lemmatizer.lemmatize(w) for w in words)
+
+    return text
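
clean() applies its steps in a fixed order (HTML text extraction, lowercasing, whitespace compression, strip, punctuation removal, then stopword removal and/or lemmatization), so the flags compose predictably. A hedged end-to-end sketch; the expected output follows from the tests below:

    from hexlib.text import clean

    clean(
        "<div>Hello, <strong>the worlds</strong></div>",
        clean_html=True,
        lowercase=True,
        strip=True,
        remove_punctuation=True,
        remove_stopwords_en=True,
        lemmatize=True,
    )  # -> "hello world"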

View File

@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.38",
+    version="1.40",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
@@ -13,6 +13,6 @@ setup(
     ]},
     install_requires=[
         "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
-        "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4"
+        "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4", "lxml", "nltk"
     ]
 )
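
The version bump from 1.38 to 1.40 and the two new install_requires entries, lxml and nltk, track the hexlib/text.py module added in this compare.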

test/test_text.py (new file, 105 lines)
View File

@@ -0,0 +1,105 @@
+from unittest import TestCase
+
+from hexlib.text import clean
+
+
+class TestText(TestCase):
+
+    def test_html_invalid(self):
+        text = ""
+        cleaned = clean(
+            text,
+            clean_html=True,
+        )
+        expected = ""
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_1(self):
+        text = "<div>Hello, <strong>world</strong></div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+        )
+        expected = "Hello, world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_2(self):
+        text = "<div>Hello, <strong>world</strong></div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True
+        )
+        expected = "hello, world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_3(self):
+        text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            compress_whitespace=True
+        )
+        expected = " hello, world "
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_4(self):
+        text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            compress_whitespace=True,
+            strip=True
+        )
+        expected = "hello, world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_5(self):
+        text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            compress_whitespace=True,
+            strip=True,
+            remove_punctuation=True
+        )
+        expected = "hello world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_6(self):
+        text = "<div>\n Hello, \t\n<strong>a the world </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            remove_stopwords_en=True
+        )
+        expected = "hello world"
+
+        self.assertEqual(cleaned, expected)
+
+    def test_html_7(self):
+        text = "<div>\n Hello, \t\n<strong>a the worlds </strong>\n\t</div>"
+        cleaned = clean(
+            text,
+            clean_html=True,
+            lowercase=True,
+            remove_punctuation=True,
+            strip=True,
+            remove_stopwords_en=True,
+            lemmatize=True
+        )
+        expected = "hello world"
+
+        self.assertEqual(cleaned, expected)
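
Every case above goes through clean_html=True, so the suite exercises the lxml extraction path alongside the nltk-backed flags. Note that importing hexlib.text triggers nltk.download for the stopwords and wordnet corpora, so the first run may pause briefly; after that the module runs under the standard library runner, e.g. python -m unittest test.test_text.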