mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-20 02:06:42 +00:00
improve text cleaning
This commit is contained in:
parent
00323ea576
commit
067a20f7a8
@@ -1,5 +1,5 @@
|
|||||||
from functools import partial
|
from functools import partial
|
||||||
from multiprocessing.pool import ThreadPool
|
from multiprocessing.pool import Pool
|
||||||
|
|
||||||
import nltk.corpus
|
import nltk.corpus
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
@@ -18,8 +18,8 @@ nltk.download("wordnet", quiet=True)
|
|||||||
lemmatizer = WordNetLemmatizer()
|
lemmatizer = WordNetLemmatizer()
|
||||||
|
|
||||||
|
|
||||||
def clean_multithread(texts, processes, **kwargs):
|
def clean_multicore(texts, processes, **kwargs):
|
||||||
pool = ThreadPool(processes=processes)
|
pool = Pool(processes=processes)
|
||||||
return pool.map(
|
return pool.map(
|
||||||
func=partial(clean, **kwargs),
|
func=partial(clean, **kwargs),
|
||||||
iterable=texts,
|
iterable=texts,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user