Compare commits

...

5 Commits

Author SHA1 Message Date
4c8b74bd8f Merge pull request #1 from simon987/dependabot/pip/pydantic-1.10.13
Bump pydantic from 1.10.11 to 1.10.13
2024-04-25 08:45:06 -04:00
dependabot[bot]
d82e1bccee Bump pydantic from 1.10.11 to 1.10.13
Bumps [pydantic](https://github.com/pydantic/pydantic) from 1.10.11 to 1.10.13.
- [Release notes](https://github.com/pydantic/pydantic/releases)
- [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md)
- [Commits](https://github.com/pydantic/pydantic/compare/v1.10.11...v1.10.13)

---
updated-dependencies:
- dependency-name: pydantic
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-25 05:06:23 +00:00
b1a1da3bac Add option to use nltk word_tokenize 2023-09-09 11:11:44 -04:00
a047366926 pin pydantic version 2023-07-13 08:27:48 -04:00
24230cdc1e Update PgConn 2023-05-26 14:09:45 -04:00
4 changed files with 21 additions and 5 deletions

View File

@@ -316,10 +316,10 @@ class PgConn:
def __init__(self, logger=None, **kwargs):
self._conn_args = kwargs
self.conn = psycopg2.connect(**kwargs)
self.cur = self.conn.cursor()
self._logger = logger
def __enter__(self):
self.cur = self.conn.cursor()
return self
def exec(self, query_string, args=None):

View File

@@ -3,6 +3,7 @@ from itertools import chain, repeat
import nltk.corpus
from lxml import etree
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
@@ -12,6 +13,7 @@ get_text = etree.XPath("//text()")
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("punkt", quiet=True)
stop_words_en = set(stopwords.words("english"))
@@ -64,7 +66,8 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
strip_dashes=False,
-remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
+remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False,
+use_nltk_tokenizer=False):
if lowercase:
text = text.lower()
@@ -96,7 +99,10 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
if remove_special_punctuation:
text = text.translate(SPECIAL_PUNCTUATION_TRANS)
-words = text.split()
+if use_nltk_tokenizer:
+words = word_tokenize(text, language="english")
+else:
+words = text.split()
if strip_quotes:
words = map(lambda w: w.strip("\"'“”"), words)

View File

@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
-version="1.86",
+version="1.89",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",
@@ -15,6 +15,6 @@ setup(
"influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
"u-msgpack-python", "psycopg2-binary", "bs4", "lxml", "nltk", "numpy",
"matplotlib", "fake-useragent @ git+https://github.com/Jordan9675/fake-useragent",
-"requests"
+"requests", "pydantic==1.10.13"
]
)

View File

@@ -267,3 +267,13 @@ class TestText(TestCase):
expected = "yes But something-something hello aa-bb"
self.assertEqual(" ".join(cleaned), expected)
def test_word_tokenize(self):
text = "i cannot believe'"
cleaned = preprocess(
text,
use_nltk_tokenizer=True
)
expected = "i can not believe '"
self.assertEqual(" ".join(cleaned), expected)