Compare commits

...

16 Commits

10 changed files with 303 additions and 282 deletions

View File

@@ -34,7 +34,7 @@ class StatelessStreamWorker:
 class StatelessStreamProcessor:
-    def __init__(self, worker_factory, chunk_size=128, processes=1):
+    def __init__(self, worker_factory, chunk_size=128, processes=1, timeout=60):
         self._chunk_size = 128
         self._queue = MPQueue(maxsize=chunk_size)
         self._queue_out = MPQueue(maxsize=processes * 2)
@@ -42,6 +42,7 @@ class StatelessStreamProcessor:
         self._processes = []
         self._factory = worker_factory
         self._workers = []
+        self._timeout = timeout

         if processes > 1:
             for _ in range(processes):
@@ -67,7 +68,7 @@ class StatelessStreamProcessor:
         ingest_thread = Thread(target=self._ingest, args=(iterable,))
         ingest_thread.start()

-        for results in queue_iter(self._queue_out, joinable=False, timeout=10):
+        for results in queue_iter(self._queue_out, joinable=False, timeout=self._timeout):
            yield from results

        ingest_thread.join()
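The new timeout argument replaces the previously hard-coded 10-second read timeout on the output queue. Below is a minimal, hedged sketch of picking a longer timeout at construction time; the import path and the placeholder worker are assumptions, only the constructor signature (worker_factory, chunk_size, processes, timeout) comes from this diff.

# Illustrative only: the module path and MyWorker are assumed, not part of this diff.
from hexlib.concurrency import StatelessStreamProcessor  # import path assumed


class MyWorker:
    """Placeholder for a StatelessStreamWorker subclass."""


processor = StatelessStreamProcessor(
    worker_factory=MyWorker,
    chunk_size=128,
    processes=4,
    timeout=300,  # per-read timeout on the output queue; previously fixed at 10 s
)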

View File

@@ -1,26 +1,24 @@
 import base64
 import sqlite3
 import traceback
+from datetime import datetime
+from enum import Enum

 import psycopg2
 import umsgpack
 from psycopg2.errorcodes import UNIQUE_VIOLATION
+from pydantic import BaseModel

 from hexlib.env import get_redis

-class PersistentState:
-    """Quick and dirty persistent dict-like SQLite wrapper"""
-    def __init__(self, dbfile="state.db", logger=None, **dbargs):
-        self.dbfile = dbfile
-        self.logger = logger
-        if dbargs is None:
-            dbargs = {"timeout": 30000}
-        self.dbargs = dbargs
-    def __getitem__(self, table):
-        return Table(self, table)
+def _json_encoder(x):
+    if isinstance(x, datetime):
+        return x.isoformat()
+    if isinstance(x, Enum):
+        return x.value
+    raise Exception(f"I don't know how to JSON encode {x} ({type(x)})")

 class VolatileState:
@@ -128,32 +126,54 @@ class Table:
         self._state = state
         self._table = table

-    def sql(self, where_clause, *params):
-        with sqlite3.connect(self._state.dbfile, **self._state.dbargs) as conn:
-            conn.row_factory = sqlite3.Row
-            try:
-                cur = conn.execute("SELECT * FROM %s %s" % (self._table, where_clause), params)
-                for row in cur:
-                    yield dict(row)
-            except:
-                return None
-
-    def __iter__(self):
-        with sqlite3.connect(self._state.dbfile, **self._state.dbargs) as conn:
-            conn.row_factory = sqlite3.Row
-            try:
-                cur = conn.execute("SELECT * FROM %s" % (self._table,))
-                for row in cur:
-                    yield dict(row)
-            except:
-                return None
-
-    def __getitem__(self, item):
+    def _sql_dict(self, where_clause, *params):
         with sqlite3.connect(self._state.dbfile, **self._state.dbargs) as conn:
             conn.row_factory = sqlite3.Row
             try:
                 col_types = conn.execute("PRAGMA table_info(%s)" % self._table).fetchall()
-                cur = conn.execute("SELECT * FROM %s WHERE id=?" % (self._table,), (item,))
+                cur = conn.execute("SELECT * FROM %s %s" % (self._table, where_clause), params)
+                for row in cur:
+                    yield dict(
+                        (col[0], _deserialize(row[col[0]], col_types[i]["type"]))
+                        for i, col in enumerate(cur.description)
+                    )
+            except:
+                return None
+
+    def sql(self, where_clause, *params):
+        for row in self._sql_dict(where_clause, *params):
+            if row and "__pydantic" in row:
+                yield self._deserialize_pydantic(row)
+            else:
+                yield row
+
+    def _iter_dict(self):
+        with sqlite3.connect(self._state.dbfile, **self._state.dbargs) as conn:
+            conn.row_factory = sqlite3.Row
+            try:
+                col_types = conn.execute("PRAGMA table_info(%s)" % self._table).fetchall()
+                cur = conn.execute("SELECT * FROM %s" % (self._table,))
+                for row in cur:
+                    yield dict(
+                        (col[0], _deserialize(row[col[0]], col_types[i]["type"]))
+                        for i, col in enumerate(cur.description)
+                    )
+            except:
+                return None
+
+    def __iter__(self):
+        for row in self._iter_dict():
+            if row and "__pydantic" in row:
+                yield self._deserialize_pydantic(row)
+            else:
+                yield row
+
+    def _getitem_dict(self, key):
+        with sqlite3.connect(self._state.dbfile, **self._state.dbargs) as conn:
+            conn.row_factory = sqlite3.Row
+            try:
+                col_types = conn.execute("PRAGMA table_info(%s)" % self._table).fetchall()
+                cur = conn.execute("SELECT * FROM %s WHERE id=?" % (self._table,), (key,))
                 row = cur.fetchone()
                 if row:
@@ -164,8 +184,32 @@ class Table:
             except:
                 return None

+    @staticmethod
+    def _deserialize_pydantic(row):
+        module = __import__(row["__module"])
+        cls = getattr(module, row["__class"])
+        return cls.parse_raw(row["json"])
+
+    def __getitem__(self, key):
+        row = self._getitem_dict(key)
+        if row and "__pydantic" in row:
+            return self._deserialize_pydantic(row)
+        return row
+
+    def setitem_pydantic(self, key, value: BaseModel):
+        self.__setitem__(key, {
+            "json": value.json(encoder=_json_encoder, indent=2),
+            "__class": value.__class__.__name__,
+            "__module": value.__class__.__module__,
+            "__pydantic": 1
+        })
+
     def __setitem__(self, key, value):
+        if isinstance(value, BaseModel):
+            self.setitem_pydantic(key, value)
+            return
+
         with sqlite3.connect(self._state.dbfile, **self._state.dbargs) as conn:
             conn.row_factory = sqlite3.Row
@@ -223,11 +267,33 @@ def _serialize(value):
 def _deserialize(value, col_type):
-    if col_type == "blob":
+    if col_type.lower() == "blob":
         return base64.b64decode(value)
     return value

+class PersistentState:
+    """Quick and dirty persistent dict-like SQLite wrapper"""
+
+    def __init__(self, dbfile="state.db", logger=None, table_factory=Table, **dbargs):
+        self.dbfile = dbfile
+        self.logger = logger
+        if dbargs is None or dbargs == {}:
+            dbargs = {"timeout": 30000}
+        self.dbargs = dbargs
+        self._table_factory = table_factory
+
+    def __getitem__(self, table):
+        return self._table_factory(self, table)
+
+    def __delitem__(self, key):
+        with sqlite3.connect(self.dbfile, **self.dbargs) as conn:
+            try:
+                conn.execute(f"DROP TABLE {key}")
+            except:
+                pass

 def pg_fetch_cursor_all(cur, name, batch_size=1000):
     while True:
         cur.execute("FETCH FORWARD %d FROM %s" % (batch_size, name))
@@ -250,10 +316,10 @@ class PgConn:
     def __init__(self, logger=None, **kwargs):
         self._conn_args = kwargs
         self.conn = psycopg2.connect(**kwargs)
-        self.cur = self.conn.cursor()
         self._logger = logger

     def __enter__(self):
+        self.cur = self.conn.cursor()
         return self

     def exec(self, query_string, args=None):
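Taken together, the Table changes let PersistentState round-trip Pydantic models transparently: __setitem__ detects a BaseModel and stores its JSON alongside __class, __module and __pydantic markers, while __getitem__, __iter__ and sql() rebuild the model on the way out. A hedged sketch of that workflow follows; the Task model and its fields are invented for illustration, and the hexlib.db import path is the one used by the new tests.

# Sketch of the Pydantic round-trip added in this diff; Task is illustrative.
from datetime import datetime

from pydantic import BaseModel

from hexlib.db import PersistentState


class Task(BaseModel):
    name: str
    created_date: datetime


state = PersistentState("state.db")

# Storing a BaseModel goes through setitem_pydantic() behind the scenes
state["tasks"]["1"] = Task(name="demo", created_date=datetime(2023, 1, 1))

# Reading it back returns a Task instance, not a dict
task = state["tasks"]["1"]
print(task.name, task.created_date.year)

# Plain dict rows keep working exactly as before
state["misc"][0] = {"x": b"abc"}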

View File

@@ -85,7 +85,7 @@ def ndjson_iter(*files, compression=""):
             line_iter = BufferedReader(gzip.open(file))
         elif compression == COMPRESSION_ZSTD:
             fp = open(file, "rb")
-            dctx = zstandard.ZstdDecompressor()
+            dctx = zstandard.ZstdDecompressor(max_window_size=2147483648)
             reader = dctx.stream_reader(fp)
             line_iter = BufferedReader(reader)
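The explicit max_window_size lets the decoder accept frames produced with zstd long-distance matching (for example zstd --long=31), which the default window limit would otherwise reject. A hedged sketch of the same pattern outside ndjson_iter; the file name is a placeholder.

# Sketch mirroring the ndjson_iter change: accept zstd frames that need a
# decompression window of up to 2 GiB. "data.ndjson.zst" is illustrative.
import io

import zstandard

with open("data.ndjson.zst", "rb") as fp:
    dctx = zstandard.ZstdDecompressor(max_window_size=2147483648)
    with dctx.stream_reader(fp) as reader:
        for line in io.BufferedReader(reader):
            print(line.decode("utf-8").rstrip())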

View File

@@ -1,227 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
DATA = [
    *["apple"] * 5,
    *["banana"] * 12,
    *["strawberry"] * 8,
    *["pineapple"] * 2,
]
class Cmap:
Accent = "Accent"
Accent_r = "Accent_r"
Blues = "Blues"
Blues_r = "Blues_r"
BrBG = "BrBG"
BrBG_r = "BrBG_r"
BuGn = "BuGn"
BuGn_r = "BuGn_r"
BuPu = "BuPu"
BuPu_r = "BuPu_r"
CMRmap = "CMRmap"
CMRmap_r = "CMRmap_r"
Dark2 = "Dark2"
Dark2_r = "Dark2_r"
GnBu = "GnBu"
GnBu_r = "GnBu_r"
Greens = "Greens"
Greens_r = "Greens_r"
Greys = "Greys"
Greys_r = "Greys_r"
OrRd = "OrRd"
OrRd_r = "OrRd_r"
Oranges = "Oranges"
Oranges_r = "Oranges_r"
PRGn = "PRGn"
PRGn_r = "PRGn_r"
Paired = "Paired"
Paired_r = "Paired_r"
Pastel1 = "Pastel1"
Pastel1_r = "Pastel1_r"
Pastel2 = "Pastel2"
Pastel2_r = "Pastel2_r"
PiYG = "PiYG"
PiYG_r = "PiYG_r"
PuBu = "PuBu"
PuBuGn = "PuBuGn"
PuBuGn_r = "PuBuGn_r"
PuBu_r = "PuBu_r"
PuOr = "PuOr"
PuOr_r = "PuOr_r"
PuRd = "PuRd"
PuRd_r = "PuRd_r"
Purples = "Purples"
Purples_r = "Purples_r"
RdBu = "RdBu"
RdBu_r = "RdBu_r"
RdGy = "RdGy"
RdGy_r = "RdGy_r"
RdPu = "RdPu"
RdPu_r = "RdPu_r"
RdYlBu = "RdYlBu"
RdYlBu_r = "RdYlBu_r"
RdYlGn = "RdYlGn"
RdYlGn_r = "RdYlGn_r"
Reds = "Reds"
Reds_r = "Reds_r"
Set1 = "Set1"
Set1_r = "Set1_r"
Set2 = "Set2"
Set2_r = "Set2_r"
Set3 = "Set3"
Set3_r = "Set3_r"
Spectral = "Spectral"
Spectral_r = "Spectral_r"
Wistia = "Wistia"
Wistia_r = "Wistia_r"
YlGn = "YlGn"
YlGnBu = "YlGnBu"
YlGnBu_r = "YlGnBu_r"
YlGn_r = "YlGn_r"
YlOrBr = "YlOrBr"
YlOrBr_r = "YlOrBr_r"
YlOrRd = "YlOrRd"
YlOrRd_r = "YlOrRd_r"
afmhot = "afmhot"
afmhot_r = "afmhot_r"
autumn = "autumn"
autumn_r = "autumn_r"
binary = "binary"
binary_r = "binary_r"
bone = "bone"
bone_r = "bone_r"
brg = "brg"
brg_r = "brg_r"
bwr = "bwr"
bwr_r = "bwr_r"
cividis = "cividis"
cividis_r = "cividis_r"
cool = "cool"
cool_r = "cool_r"
coolwarm = "coolwarm"
coolwarm_r = "coolwarm_r"
copper = "copper"
copper_r = "copper_r"
cubehelix = "cubehelix"
cubehelix_r = "cubehelix_r"
flag = "flag"
flag_r = "flag_r"
gist_earth = "gist_earth"
gist_earth_r = "gist_earth_r"
gist_gray = "gist_gray"
gist_gray_r = "gist_gray_r"
gist_heat = "gist_heat"
gist_heat_r = "gist_heat_r"
gist_ncar = "gist_ncar"
gist_ncar_r = "gist_ncar_r"
gist_rainbow = "gist_rainbow"
gist_rainbow_r = "gist_rainbow_r"
gist_stern = "gist_stern"
gist_stern_r = "gist_stern_r"
gist_yarg = "gist_yarg"
gist_yarg_r = "gist_yarg_r"
gnuplot = "gnuplot"
gnuplot2 = "gnuplot2"
gnuplot2_r = "gnuplot2_r"
gnuplot_r = "gnuplot_r"
gray = "gray"
gray_r = "gray_r"
hot = "hot"
hot_r = "hot_r"
hsv = "hsv"
hsv_r = "hsv_r"
inferno = "inferno"
inferno_r = "inferno_r"
jet = "jet"
jet_r = "jet_r"
magma = "magma"
magma_r = "magma_r"
nipy_spectral = "nipy_spectral"
nipy_spectral_r = "nipy_spectral_r"
ocean = "ocean"
ocean_r = "ocean_r"
pink = "pink"
pink_r = "pink_r"
plasma = "plasma"
plasma_r = "plasma_r"
prism = "prism"
prism_r = "prism_r"
rainbow = "rainbow"
rainbow_r = "rainbow_r"
seismic = "seismic"
seismic_r = "seismic_r"
spring = "spring"
spring_r = "spring_r"
summer = "summer"
summer_r = "summer_r"
tab10 = "tab10"
tab10_r = "tab10_r"
tab20 = "tab20"
tab20_r = "tab20_r"
tab20b = "tab20b"
tab20b_r = "tab20b_r"
tab20c = "tab20c"
tab20c_r = "tab20c_r"
terrain = "terrain"
terrain_r = "terrain_r"
turbo = "turbo"
turbo_r = "turbo_r"
twilight = "twilight"
twilight_r = "twilight_r"
twilight_shifted = "twilight_shifted"
twilight_shifted_r = "twilight_shifted_r"
viridis = "viridis"
viridis_r = "viridis_r"
winter = "winter"
winter_r = "winter_r"
def plot_freq_bar(items, ylabel="frequency", title=""):
    item_set, item_counts = np.unique(items, return_counts=True)
    plt.bar(item_set, item_counts)
    plt.xticks(rotation=35)
    plt.ylabel(ylabel)
    plt.title(title)
    for i, cnt in enumerate(item_counts):
        plt.text(x=i, y=cnt / 2, s=cnt, ha="center", color="white")
    plt.tight_layout()


def plot_confusion_matrix(y_true=None, y_pred=None, cm=None, labels=None, title=None, cmap=None):
    if not cm:
        cm = confusion_matrix(y_true, y_pred, labels=labels)
    if type(cm) == list:
        cm = np.array(cm)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=labels)
    cm_display.plot(cmap=cmap)
    if title:
        plt.title(title)
    if labels:
        plt.xticks(rotation=30)
    plt.tight_layout()


if __name__ == '__main__':
    plot_freq_bar(DATA, title="My title")
    plt.show()

    plot_confusion_matrix(
        cm=[[12, 1, 0],
            [3, 14, 1],
            [5, 6, 7]],
        title="My title",
        labels=["apple", "orange", "grape"],
        cmap=Cmap.viridis
    )
    plt.show()

View File

@@ -1,16 +1,20 @@
-from functools import partial
+import re
 from itertools import chain, repeat
-from multiprocessing.pool import Pool

 import nltk.corpus
 from lxml import etree
+from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer

-from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE, XML_ENTITY_RE
+from .regex_util import LINK_RE

 get_text = etree.XPath("//text()")

+nltk.download("stopwords", quiet=True)
+nltk.download("wordnet", quiet=True)
+nltk.download("punkt", quiet=True)
+
 stop_words_en = set(stopwords.words("english"))

 extra_stop_words_en = [
@@ -19,9 +23,6 @@ extra_stop_words_en = [
 stop_words_en.update(extra_stop_words_en)

-nltk.download("stopwords", quiet=True)
-nltk.download("wordnet", quiet=True)
-
 lemmatizer = WordNetLemmatizer()
@@ -53,13 +54,20 @@ SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", l
 DASHES = ("", "", "", "")
 DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES))))
+DASHES_RE = re.compile(r"-+")

-PUNCTUATION = ".,;:\"!?/()|*=>"
+SPECIAL_PUNCTUATION = ";:\"/()|*=>"
+SPECIAL_PUNCTUATION_TRANS = str.maketrans(SPECIAL_PUNCTUATION, " " * len(SPECIAL_PUNCTUATION))
+
+PUNCTUATION = ".,!?"
 PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))

-def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,
-               lemmatize=False, fix_single_quotes=False, strip_quotes=False, remove_urls=False, bigrams: set = None,
-               trigrams: set = None, remove_numbers=False):
+def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_special_punctuation=False,
+               remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
+               strip_dashes=False,
+               remove_urls=False, bigrams: set = None, trigrams: set = None, remove_numbers=False,
+               use_nltk_tokenizer=False):
     if lowercase:
         text = text.lower()
@@ -68,6 +76,9 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
         text = text.translate(DASHES_TRANS)

+    if strip_dashes:
+        text = DASHES_RE.sub("-", text)
+
     if remove_urls:
         text = LINK_RE.sub(" ", text)
@@ -85,11 +96,20 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False
     if remove_punctuation:
         text = text.translate(PUNCTUATION_TRANS)

-    words = text.split()
+    if remove_special_punctuation:
+        text = text.translate(SPECIAL_PUNCTUATION_TRANS)
+
+    if use_nltk_tokenizer:
+        words = word_tokenize(text, language="english")
+    else:
+        words = text.split()

     if strip_quotes:
         words = map(lambda w: w.strip("\"'“”"), words)

+    if strip_dashes:
+        words = map(lambda w: w.strip("-"), words)
+
     if bigrams:
         words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams)
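preprocess() gains three new switches: remove_special_punctuation strips ;:"/()|*=> independently of the sentence punctuation .,!?, strip_dashes collapses runs of dashes and trims stray dashes from tokens, and use_nltk_tokenizer swaps str.split() for nltk's word_tokenize. A hedged usage sketch follows, assuming the module is importable as hexlib.text (the import path is not shown in this diff).

# Sketch only: the import path is assumed; the flags are the ones added here.
# preprocess() returns an iterable of tokens.
from hexlib.text import preprocess

tokens = preprocess(
    "Check https://example.com -- it's GREAT (really)!",
    lowercase=True,
    remove_urls=True,
    remove_punctuation=True,          # strips . , ! ?
    remove_special_punctuation=True,  # strips ; : " / ( ) | * = >
    strip_dashes=True,                # "--" becomes "-" and edge dashes are trimmed
    fix_single_quotes=True,
)
print(" ".join(tokens))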

View File

@@ -2,7 +2,7 @@ from setuptools import setup
 setup(
     name="hexlib",
-    version="1.73",
+    version="1.89",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
@@ -12,9 +12,9 @@ setup(
         "data/*"
     ]},
     install_requires=[
-        "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
+        "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard",
         "u-msgpack-python", "psycopg2-binary", "bs4", "lxml", "nltk", "numpy",
-        "matplotlib", "scikit-learn", "fake-useragent @ git+https://github.com/Jordan9675/fake-useragent",
-        "requests"
+        "matplotlib", "fake-useragent @ git+https://github.com/Jordan9675/fake-useragent",
+        "requests", "pydantic==1.10.11"
     ]
 )

View File

@@ -110,3 +110,34 @@ class TestPersistentState(TestCase):
             del s["a"][456]
         except Exception as e:
             self.fail(e)

+    def test_deserialize_get_set(self):
+        s = PersistentState()
+        s["a"][0] = {"x": b'abc'}
+        self.assertEqual(s["a"][0]["x"], b'abc')
+
+    def test_deserialize_sql(self):
+        s = PersistentState()
+        s["a"][0] = {"x": b'abc'}
+        self.assertEqual(list(s["a"].sql("WHERE 1=1"))[0]["x"], b'abc')
+
+    def test_deserialize_iter(self):
+        s = PersistentState()
+        s["a"][0] = {"x": b'abc'}
+        self.assertEqual(list(s["a"])[0]["x"], b'abc')
+
+    def test_drop_table(self):
+        s = PersistentState()
+        s["a"][0] = {"x": 1}
+        s["a"][1] = {"x": 2}
+
+        self.assertEqual(len(list(s["a"])), 2)
+
+        del s["a"]
+
+        self.assertEqual(len(list(s["a"])), 0)

test/test_PydanticTable.py (new file, 110 additions)
View File

@@ -0,0 +1,110 @@
import os
from datetime import datetime
from enum import Enum
from typing import Optional
from unittest import TestCase

from pydantic import BaseModel
from pydantic.types import List

from hexlib.db import PersistentState


class Status(Enum):
    yes = "yes"
    no = "no"


class Point(BaseModel):
    x: int
    y: int


class Polygon(BaseModel):
    points: List[Point] = []
    created_date: datetime
    status: Status = Status("yes")


class TestPydanticTable(TestCase):
    def tearDown(self) -> None:
        if os.path.exists("state.db"):
            os.remove("state.db")

    def setUp(self) -> None:
        if os.path.exists("state.db"):
            os.remove("state.db")

    def test_get_set(self):
        s = PersistentState()

        val = Polygon(
            created_date=datetime(year=2000, day=1, month=1),
            points=[
                Point(x=1, y=2),
                Point(x=3, y=4),
            ],
        )
        s["a"]["1"] = val

        self.assertEqual(s["a"]["1"].points[0].x, 1)
        self.assertEqual(s["a"]["1"].status, Status("yes"))
        self.assertEqual(s["a"]["1"].points[1].x, 3)
        self.assertEqual(s["a"]["1"].created_date.year, 2000)

    def test_update(self):
        s = PersistentState()

        val = Polygon(
            created_date=datetime(year=2000, day=1, month=1),
            points=[
                Point(x=1, y=2),
                Point(x=3, y=4),
            ]
        )
        s["a"]["1"] = val
        self.assertEqual(s["a"]["1"].points[0].x, 1)

        val.points[0].x = 2
        s["a"]["1"] = val
        self.assertEqual(s["a"]["1"].points[0].x, 2)

    def test_sql(self):
        s = PersistentState()

        s["b"]["1"] = Polygon(
            created_date=datetime(year=2000, day=1, month=1),
            points=[]
        )
        s["b"]["2"] = Polygon(
            created_date=datetime(year=2010, day=1, month=1),
            points=[]
        )

        result = list(s["b"].sql(
            "WHERE json->>'created_date' LIKE '2000-%'"
        ))
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].created_date.year, 2000)

    def test_iterate(self):
        s = PersistentState()

        s["b"]["1"] = Polygon(
            created_date=datetime(year=2000, day=1, month=1),
            points=[]
        )
        s["b"]["2"] = Polygon(
            created_date=datetime(year=2010, day=1, month=1),
            points=[]
        )

        result = list(s["b"])
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].created_date.year, 2000)
        self.assertEqual(result[1].created_date.year, 2010)

View File

@@ -152,7 +152,7 @@ class TestText(TestCase):
             remove_stopwords_en=True,
             remove_urls=True
         )
-        expected = "hello world"
+        expected = "hello world |"

         self.assertEqual(" ".join(cleaned), expected)
@@ -170,7 +170,7 @@
             remove_urls=False
         )
-        expected = "217709510 is there a servant that is against civilization and humanity literally instant summon"
+        expected = ">>217709510 is there a servant that is against civilization and humanity literally instant summon"

         self.assertEqual(" ".join(cleaned), expected)

     def test_html_entity(self):
@@ -257,3 +257,23 @@
         expected = "hi test hello"
         self.assertEqual(" ".join(cleaned), expected)
+
+    def test_strip_dashes(self):
+        text = "yes -But something-something -- hello aa--bb"
+
+        cleaned = preprocess(
+            text,
+            strip_dashes=True
+        )
+        expected = "yes But something-something hello aa-bb"
+
+        self.assertEqual(" ".join(cleaned), expected)
+
+    def test_word_tokenize(self):
+        text = "i cannot believe'"
+
+        cleaned = preprocess(
+            text,
+            use_nltk_tokenizer=True
+        )
+        expected = "i can not believe '"
+
+        self.assertEqual(" ".join(cleaned), expected)