improve text cleaning

This commit is contained in:
simon987 2021-04-18 20:32:34 -04:00
parent 00323ea576
commit 067a20f7a8

View File

@@ -1,5 +1,5 @@
from functools import partial from functools import partial
from multiprocessing.pool import ThreadPool from multiprocessing.pool import Pool
import nltk.corpus import nltk.corpus
from lxml import etree from lxml import etree
@@ -18,8 +18,8 @@ nltk.download("wordnet", quiet=True)
lemmatizer = WordNetLemmatizer() lemmatizer = WordNetLemmatizer()
def clean_multithread(texts, processes, **kwargs): def clean_multicore(texts, processes, **kwargs):
pool = ThreadPool(processes=processes) pool = Pool(processes=processes)
return pool.map( return pool.map(
func=partial(clean, **kwargs), func=partial(clean, **kwargs),
iterable=texts, iterable=texts,