mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-10 14:06:43 +00:00
improve text cleaning
This commit is contained in:
parent
00323ea576
commit
067a20f7a8
@@ -1,5 +1,5 @@
|
||||
from functools import partial
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import nltk.corpus
|
||||
from lxml import etree
|
||||
@@ -18,8 +18,8 @@ nltk.download("wordnet", quiet=True)
|
||||
lemmatizer = WordNetLemmatizer()
|
||||
|
||||
|
||||
def clean_multithread(texts, processes, **kwargs):
|
||||
pool = ThreadPool(processes=processes)
|
||||
def clean_multicore(texts, processes, **kwargs):
|
||||
pool = Pool(processes=processes)
|
||||
return pool.map(
|
||||
func=partial(clean, **kwargs),
|
||||
iterable=texts,
|
||||
|
Loading…
x
Reference in New Issue
Block a user