Merge pull request #1 from simon987/dependabot/pip/pydantic-1.10.13

Bump pydantic from 1.10.11 to 1.10.13
2025-12-20 09:45:56 +00:00 · 2024-04-25 08:45:06 -04:00 · 2024-04-25 05:06:23 +00:00 · 2023-09-09 11:11:44 -04:00 · 2023-07-13 08:27:48 -04:00 · 2023-05-26 14:09:45 -04:00
4 changed files with 21 additions and 5 deletions
--- a/hexlib/db.py
+++ b/hexlib/db.py
@@ -316,10 +316,10 @@ class PgConn:
    def __init__(self, logger=None, **kwargs):
        self._conn_args = kwargs
        self.conn = psycopg2.connect(**kwargs)
-        self.cur = self.conn.cursor()
        self._logger = logger

    def __enter__(self):
+        self.cur = self.conn.cursor()
        return self

    def exec(self, query_string, args=None):
--- a/hexlib/text.py
+++ b/hexlib/text.py
@@ -3,6 +3,7 @@ from itertools import chain, repeat

 import nltk.corpus
 from lxml import etree
+from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer

@@ -12,6 +13,7 @@ get_text = etree.XPath("//text()")

 nltk.download("stopwords", quiet=True)
 nltk.download("wordnet", quiet=True)
+nltk.download("punkt", quiet=True)

 stop_words_en = set(stopwords.words("english"))

@@ -64,7 +66,8 @@ PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
 def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
               remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
               strip_dashes=False,
-               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False):
+               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False,
+               use_nltk_tokenizer=False):
    if lowercase:
        text = text.lower()

@@ -96,7 +99,10 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
    if remove_special_punctuation:
        text = text.translate(SPECIAL_PUNCTUATION_TRANS)

-    words = text.split()
+    if use_nltk_tokenizer:
+        words = word_tokenize(text, language="english")
+    else:
+        words = text.split()

    if strip_quotes:
        words = map(lambda w: w.strip("\"'“”"), words)
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup

 setup(
    name="hexlib",
-    version="1.86",
+    version="1.89",
    description="Misc utility methods",
    author="simon987",
    author_email="me@simon987.net",
@@ -15,6 +15,6 @@ setup(
        "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
        "u-msgpack-python", "psycopg2-binary", "bs4", "lxml", "nltk", "numpy",
        "matplotlib", "fake-useragent @ git+https://github.com/Jordan9675/fake-useragent",
-        "requests"
+        "requests", "pydantic==1.10.13"
    ]
 )
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -267,3 +267,13 @@ class TestText(TestCase):
        expected = "yes But something-something hello aa-bb"

        self.assertEqual(" ".join(cleaned), expected)
+
+    def test_word_tokenize(self):
+        text = "i cannot believe'"
+        cleaned = preprocess(
+            text,
+            use_nltk_tokenizer=True
+        )
+        expected = "i can not believe '"
+
+        self.assertEqual(" ".join(cleaned), expected)
Author	SHA1	Message	Date
simon987	4c8b74bd8f	Merge pull request #1 from simon987/dependabot/pip/pydantic-1.10.13 Bump pydantic from 1.10.11 to 1.10.13	2024-04-25 08:45:06 -04:00
dependabot[bot]	d82e1bccee	Bump pydantic from 1.10.11 to 1.10.13 Bumps [pydantic](https://github.com/pydantic/pydantic) from 1.10.11 to 1.10.13. - [Release notes](https://github.com/pydantic/pydantic/releases) - [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md) - [Commits](https://github.com/pydantic/pydantic/compare/v1.10.11...v1.10.13) --- updated-dependencies: - dependency-name: pydantic dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com>	2024-04-25 05:06:23 +00:00
simon987	b1a1da3bac	Add option to use nltk word_tokenize	2023-09-09 11:11:44 -04:00
simon987	a047366926	pin pydantic version	2023-07-13 08:27:48 -04:00
simon987	24230cdc1e	Update PgConn	2023-05-26 14:09:45 -04:00