Add StatefulStreamProcessor

This commit is contained in:
simon987 2021-09-19 12:39:57 -04:00
parent 7349c9a5f1
commit 71cd00c063
4 changed files with 71 additions and 4 deletions

View File

@ -1,9 +1,65 @@
from queue import Queue, Empty from queue import Queue, Empty
from multiprocessing import Process from multiprocessing import Process
from multiprocessing import Queue as MPQueue
from threading import Thread from threading import Thread
from hexlib.misc import ichunks
def queue_iter(q: Queue, **get_args):
class StatefulStreamWorker:
    """Base class for workers that consume a stream of items chunk by chunk.

    Subclasses override :meth:`process` to handle a single item and
    :meth:`results` to expose whatever they computed.
    """

    def __init__(self):
        pass

    def run(self, q: Queue):
        """Consume chunks from *q* until a ``get`` times out after 3 seconds.

        :param q: queue whose entries are iterables of items (chunks)
        """
        # multiprocessing.Queue does not implement task_done(); only ask
        # queue_iter to acknowledge tasks when the queue supports it.
        joinable = hasattr(q, "task_done")
        for chunk in queue_iter(q, joinable=joinable, timeout=3):
            self.process_chunk(chunk)

    def process_chunk(self, chunk):
        """Call :meth:`process` on every item of *chunk*, in order."""
        for item in chunk:
            self.process(item)

    def process(self, item) -> None:
        """Handle a single item. Must be overridden by subclasses."""
        raise NotImplementedError

    def results(self):
        """Return the state accumulated by this worker.

        Must be overridden by subclasses whose results are read back.
        """
        raise NotImplementedError
class StatefulStreamProcessor:
    """Feed an iterable to one or more stateful workers.

    With ``processes > 1`` the items are grouped into chunks and pushed on a
    multiprocessing queue consumed by worker processes; otherwise a single
    worker processes every item in the calling process.
    """

    def __init__(self, worker_factory, chunk_size=128, processes=1):
        """
        :param worker_factory: zero-argument callable returning a worker
            object exposing ``run(queue)``, ``process(item)`` and ``results()``
        :param chunk_size: number of items per chunk sent to the worker
            processes; also used as the capacity of the shared queue
        :param processes: number of worker processes; values of 1 or less
            run everything in the calling process
        """
        # Honour the caller-supplied chunk size instead of a fixed 128.
        self._chunk_size = chunk_size
        # Bounded queue: put() blocks once this many chunks are pending.
        self._queue = MPQueue(maxsize=chunk_size)
        self._process_count = processes
        self._processes = []
        self._factory = worker_factory
        self._workers = []

        if processes > 1:
            for _ in range(processes):
                worker = self._factory()
                p = Process(target=worker.run, args=(self._queue,))
                p.start()
                self._processes.append(p)
                self._workers.append(worker)
        else:
            self._workers.append(self._factory())

    def injest(self, iterable):
        """Send every item of *iterable* to the worker(s).

        In multi-process mode this blocks until all worker processes exit.
        """
        if self._process_count > 1:
            for chunk in ichunks(iterable, self._chunk_size):
                self._queue.put(chunk)

            for p in self._processes:
                p.join()
        else:
            for item in iterable:
                self._workers[0].process(item)

    def get_results(self):
        """Yield ``results()`` of each worker object held by this processor.

        NOTE(review): in multi-process mode each child process presumably
        works on its own copy of the worker, so the objects queried here may
        not reflect state accumulated in the children -- confirm, and
        consider sending results back through a queue.
        """
        for worker in self._workers:
            yield worker.results()
def queue_iter(q: Queue, joinable=True, **get_args):
while True: while True:
try: try:
task = q.get(**get_args) task = q.get(**get_args)
@ -12,7 +68,8 @@ def queue_iter(q: Queue, **get_args):
break break
yield task yield task
q.task_done() if joinable:
q.task_done()
except Empty: except Empty:
break break
except KeyboardInterrupt: except KeyboardInterrupt:

View File

@ -1,4 +1,5 @@
import atexit import atexit
import itertools
import os import os
import sys import sys
import time import time
@ -33,6 +34,15 @@ def chunks(lst: list, chunk_len: int):
yield lst[i:i + chunk_len] yield lst[i:i + chunk_len]
def ichunks(iterable, chunk_len: int):
    """Lazily split *iterable* into tuples of at most *chunk_len* items.

    Every yielded tuple holds *chunk_len* items except possibly the last
    one, which holds the remainder. Nothing is yielded for an empty
    iterable.
    """
    source = iter(iterable)

    def take():
        # Pull the next group of items from the shared iterator.
        return tuple(itertools.islice(source, chunk_len))

    # iter(callable, sentinel) stops as soon as an empty tuple comes back.
    yield from iter(take, ())
def rate_limit(per_second): def rate_limit(per_second):
min_interval = 1.0 / float(per_second) min_interval = 1.0 / float(per_second)

View File

@ -60,7 +60,7 @@ SINGLE_QUOTES = ("", "`")
SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES)))) SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES))))
PUNCTUATION = ".,;:\"!?/()|*=>" PUNCTUATION = ".,;:\"!?/()|*=>"
PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, len(PUNCTUATION)) PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False, def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False, remove_stopwords_en=False,

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup( setup(
name="hexlib", name="hexlib",
version="1.52", version="1.53",
description="Misc utility methods", description="Misc utility methods",
author="simon987", author="simon987",
author_email="me@simon987.net", author_email="me@simon987.net",