Compare commits

18cd59fc4a981849799c86a7e22750bb177cad9a..30902c82354276ef80afdd062fa7899d5426db92

No commits in common. "18cd59fc4a981849799c86a7e22750bb177cad9a" and "30902c82354276ef80afdd062fa7899d5426db92" have entirely different histories.

5 changed files with 4 additions and 180 deletions

View File

@@ -1,8 +1,9 @@
+import atexit
+
 import os
 import sys
 import time
 from threading import Lock
-import atexit
 from time import sleep
 
 import siphash
@@ -22,9 +23,7 @@ def retry(attempts, callback=None, retry_sleep=0):
                     callback(e)
                 retries -= 1
                 sleep(retry_sleep)
-
 
         return wrapper
-
 
     return decorate
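For context, the decorator in the hunk above wraps a function in a retry loop. A minimal usage sketch, assuming retry is importable from the package root (the import path and the flaky function are hypothetical):

    import random

    from hexlib import retry  # assumed import location

    @retry(attempts=3, callback=print, retry_sleep=1)
    def flaky():
        # Raises about half the time; retry() re-invokes it up to
        # 3 times, passing each exception to callback and sleeping
        # retry_sleep seconds between attempts.
        if random.random() < 0.5:
            raise ValueError("transient failure")
        return "ok"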
@@ -106,20 +105,4 @@ class CustomStdOut:
         self.fp.close()
 
 
-class CustomStdErr:
-    original_stderr = sys.stderr
-
-    def __init__(self, fname):
-        self.fname = fname
-
-    def __enter__(self):
-        self.fp = open(self.fname, "w")
-        sys.stderr = self.fp
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        sys.stdout = CustomStdErr.original_stderr
-        self.fp.close()
-
-
 silent_stdout = CustomStdOut(os.devnull)
-silent_stderr = CustomStdErr(os.devnull)
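The deleted CustomStdErr class mirrors CustomStdOut, though as written its __exit__ restores sys.stdout rather than sys.stderr. The standard library's contextlib covers this use case directly; a rough stdlib-only equivalent of the removed silent_stderr (a sketch, not part of this repository):

    import os
    import sys
    from contextlib import redirect_stderr

    # Temporarily send stderr to /dev/null; the original stream is
    # restored when the block exits, even if an exception is raised.
    with open(os.devnull, "w") as devnull:
        with redirect_stderr(devnull):
            print("this line is discarded", file=sys.stderr)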

View File

@@ -2,5 +2,3 @@ import re
 
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
 HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
-WHITESPACE_RE = re.compile(r"\s+")
-PUNCTUATION_RE = re.compile(r"[.,;:\"']+")
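Both removed patterns are one-line text normalizers. A standalone demonstration of their behavior, using the literal patterns from the hunk above:

    import re

    WHITESPACE_RE = re.compile(r"\s+")
    PUNCTUATION_RE = re.compile(r"[.,;:\"']+")

    text = "some,  'quoted':  text."
    print(WHITESPACE_RE.sub(" ", text))   # some, 'quoted': text.
    print(PUNCTUATION_RE.sub("", text))   # some  quoted  text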

View File

@@ -1,52 +0,0 @@
-import nltk.corpus
-from lxml import etree
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-
-from .regex import WHITESPACE_RE, PUNCTUATION_RE
-
-get_text = etree.XPath("//text()")
-
-stop_words_en = set(stopwords.words("english"))
-
-nltk.download("stopwords", quiet=True)
-nltk.download("wordnet", quiet=True)
-
-lemmatizer = WordNetLemmatizer()
-
-
-def clean(text, compress_whitespace=False, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
-          remove_stopwords_en=False, lemmatize=False):
-    if compress_whitespace and remove_stopwords_en:
-        raise ValueError("Redundant flags: remove_stopwords implies compress_whitespace")
-
-    if clean_html:
-        try:
-            root = etree.fromstring(text)
-            text = "".join(get_text(root))
-        except:
-            pass
-
-    if lowercase:
-        text = text.lower()
-
-    if compress_whitespace:
-        text = WHITESPACE_RE.sub(" ", text)
-
-    if strip:
-        text = text.strip()
-
-    if remove_punctuation:
-        text = PUNCTUATION_RE.sub("", text)
-
-    if remove_stopwords_en or lemmatize:
-        words = WHITESPACE_RE.split(text)
-
-        if lemmatize and remove_stopwords_en:
-            text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
-        elif not lemmatize and remove_stopwords_en:
-            text = " ".join(w for w in words if w not in stop_words_en)
-        elif lemmatize and not remove_stopwords_en:
-            text = " ".join(lemmatizer.lemmatize(w) for w in words)
-
-    return text
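The deleted clean() applies its flags as a fixed pipeline: HTML text extraction, lowercasing, whitespace compression, stripping, punctuation removal, then stop-word removal and/or lemmatization. A usage sketch against the pre-removal tree (output derived from the deleted implementation above):

    from hexlib.text import clean  # module removed by this commit

    # "The" is an English stop word; "cats" lemmatizes to "cat".
    print(clean("<b>The Cats</b>", clean_html=True, lowercase=True,
                remove_stopwords_en=True, lemmatize=True))
    # -> cat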

View File

@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.40",
+    version="1.38",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
@@ -13,6 +13,6 @@ setup(
     ]},
     install_requires=[
         "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
-        "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4", "lxml", "nltk"
+        "u-msgpack-python", "psycopg2-binary", "fake-useragent", "bs4"
     ]
 )

View File

@@ -1,105 +0,0 @@
-from unittest import TestCase
-
-from hexlib.text import clean
-
-
-class TestText(TestCase):
-
-    def test_html_invalid(self):
-        text = ""
-        cleaned = clean(
-            text,
-            clean_html=True,
-        )
-        expected = ""
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_1(self):
-        text = "<div>Hello, <strong>world</strong></div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-        )
-        expected = "Hello, world"
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_2(self):
-        text = "<div>Hello, <strong>world</strong></div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True
-        )
-        expected = "hello, world"
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_3(self):
-        text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            compress_whitespace=True
-        )
-        expected = " hello, world "
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_4(self):
-        text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            compress_whitespace=True,
-            strip=True
-        )
-        expected = "hello, world"
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_5(self):
-        text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            compress_whitespace=True,
-            strip=True,
-            remove_punctuation=True
-        )
-        expected = "hello world"
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_6(self):
-        text = "<div>\n Hello, \t\n<strong>a the world </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            remove_punctuation=True,
-            strip=True,
-            remove_stopwords_en=True
-        )
-        expected = "hello world"
-
-        self.assertEqual(cleaned, expected)
-
-    def test_html_7(self):
-        text = "<div>\n Hello, \t\n<strong>a the worlds </strong>\n\t</div>"
-        cleaned = clean(
-            text,
-            clean_html=True,
-            lowercase=True,
-            remove_punctuation=True,
-            strip=True,
-            remove_stopwords_en=True,
-            lemmatize=True
-        )
-        expected = "hello world"
-
-        self.assertEqual(cleaned, expected)
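For reference, the deleted suite uses the stock unittest pattern; from a checkout of the pre-removal commit it would be discovered and run with the standard runner (the test file's path is not shown in this diff):

    python -m unittest discover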