mirror of
				https://github.com/simon987/hexlib.git
				synced 2025-10-31 08:16:51 +00:00 
			
		
		
		
	Compare commits
	
		
			6 Commits
		
	
	
		
			db3e191983
			...
			60273fb6bd
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 60273fb6bd | |||
| 67c09cc10c | |||
| a7bf5b2d15 | |||
| 31b35e3a32 | |||
| 4cff343370 | |||
| 4d6c8018df | 
| @ -3,4 +3,5 @@ import re | |||||||
# Matches http(s) URLs: host with a 2-4 letter TLD, then everything up to the
# next whitespace, '<', or quote character (or end of string).
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
# Captures the value of a double-quoted href="..." HTML attribute.
HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
# One or more consecutive whitespace characters (used for collapsing/splitting).
WHITESPACE_RE = re.compile(r"\s+")
# Run of punctuation characters to strip; '>' is included (e.g. quote markers
# left over after HTML entity decoding).
PUNCTUATION_RE = re.compile(r"[.,;:\"!?/()|*=>]+")
# A named XML/HTML entity such as &amp; or &gt; (lowercase names only).
XML_ENTITY_RE = re.compile(r"&[a-z]+;")
|  | |||||||
| @ -6,7 +6,7 @@ from lxml import etree | |||||||
| from nltk.corpus import stopwords | from nltk.corpus import stopwords | ||||||
| from nltk.stem import WordNetLemmatizer | from nltk.stem import WordNetLemmatizer | ||||||
| 
 | 
 | ||||||
| from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE | from .regex import WHITESPACE_RE, PUNCTUATION_RE, LINK_RE, XML_ENTITY_RE | ||||||
| 
 | 
 | ||||||
| get_text = etree.XPath("//text()") | get_text = etree.XPath("//text()") | ||||||
| 
 | 
 | ||||||
| @ -24,11 +24,12 @@ nltk.download("wordnet", quiet=True) | |||||||
| lemmatizer = WordNetLemmatizer() | lemmatizer = WordNetLemmatizer() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def clean_multicore(texts, processes, chunk_size=10000, **kwargs):
    """Preprocess an iterable of texts in parallel and yield the results.

    :param texts: iterable of input strings to clean
    :param processes: number of worker processes to spawn
    :param chunk_size: chunksize handed to ``Pool.imap`` — larger chunks mean
        fewer IPC round-trips per item (default 10000)
    :param kwargs: keyword options forwarded verbatim to ``preprocess``
    :return: generator of cleaned texts, in input order
    """
    # Fix: the original leaked the Pool (never closed/joined/terminated).
    # The context manager guarantees workers are torn down when iteration
    # finishes or the consumer abandons the generator.
    with Pool(processes=processes) as pool:
        yield from pool.imap(
            func=partial(preprocess, **kwargs),
            iterable=texts,
            chunksize=chunk_size,
        )
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -44,7 +45,7 @@ def _transform_bigram(ngram_seq, ngrams): | |||||||
| 
 | 
 | ||||||
| def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, | def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, | ||||||
|                remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, |                remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, | ||||||
|                remove_urls=False, bigrams: set = None): |                remove_urls=False, bigrams: set = None, remove_numbers=False): | ||||||
|     if lowercase: |     if lowercase: | ||||||
|         text = text.lower() |         text = text.lower() | ||||||
| 
 | 
 | ||||||
| @ -56,8 +57,12 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc | |||||||
| 
 | 
 | ||||||
|     if clean_html: |     if clean_html: | ||||||
|         try: |         try: | ||||||
|             root = etree.fromstring(text.replace("&", "")) |             text = "<root>" + text + "</root>" | ||||||
|             text = "".join(get_text(root)) | 
 | ||||||
|  |             parser = etree.XMLParser(recover=True) | ||||||
|  |             root = etree.fromstring(text, parser) | ||||||
|  | 
 | ||||||
|  |             text = " ".join(get_text(root)) | ||||||
|         except: |         except: | ||||||
|             pass |             pass | ||||||
| 
 | 
 | ||||||
| @ -75,9 +80,14 @@ def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punc | |||||||
|         words.append("*") |         words.append("*") | ||||||
|         text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) |         text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) | ||||||
| 
 | 
 | ||||||
|     if remove_stopwords_en or lemmatize: |     if remove_stopwords_en or lemmatize or remove_numbers: | ||||||
|         words = text.split(" ") |         words = text.split(" ") | ||||||
| 
 | 
 | ||||||
|  |         if remove_numbers: | ||||||
|  |             words = filter(lambda w: not w.isnumeric(), words) | ||||||
|  | 
 | ||||||
|  |         if not lemmatize and not remove_stopwords_en: | ||||||
|  |             text = " ".join(words) | ||||||
|         if lemmatize and remove_stopwords_en: |         if lemmatize and remove_stopwords_en: | ||||||
|             text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) |             text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) | ||||||
|         elif not lemmatize and remove_stopwords_en: |         elif not lemmatize and remove_stopwords_en: | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @ -2,7 +2,7 @@ from setuptools import setup | |||||||
| 
 | 
 | ||||||
| setup( | setup( | ||||||
|     name="hexlib", |     name="hexlib", | ||||||
|     version="1.44", |     version="1.48", | ||||||
|     description="Misc utility methods", |     description="Misc utility methods", | ||||||
|     author="simon987", |     author="simon987", | ||||||
|     author_email="me@simon987.net", |     author_email="me@simon987.net", | ||||||
|  | |||||||
| @ -164,6 +164,61 @@ class TestText(TestCase): | |||||||
| 
 | 
 | ||||||
|         self.assertEqual(cleaned, expected) |         self.assertEqual(cleaned, expected) | ||||||
| 
 | 
 | ||||||
|  |     def test_html_no_root(self): | ||||||
|  |         text = "<a href=\"#p217709510\" class=\"quotelink\">>>217709510</a><br>Is there a<wbr>servant that is against civilization and humanity?<br>Literally instant summon." | ||||||
|  | 
 | ||||||
|  |         cleaned = preprocess( | ||||||
|  |             text, | ||||||
|  |             clean_html=True, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_punctuation=True, | ||||||
|  |             strip=True, | ||||||
|  |             lemmatize=False, | ||||||
|  |             fix_single_quotes=True, | ||||||
|  |             remove_stopwords_en=False, | ||||||
|  |             remove_urls=False | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         expected = "217709510 is there a servant that is against civilization and humanity literally instant summon" | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
|  | 
 | ||||||
|  |     def test_html_entity(self): | ||||||
|  |         text = "doesn't" | ||||||
|  | 
 | ||||||
|  |         cleaned = preprocess( | ||||||
|  |             text, | ||||||
|  |             clean_html=True, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_punctuation=True, | ||||||
|  |             strip=True, | ||||||
|  |             lemmatize=False, | ||||||
|  |             fix_single_quotes=True, | ||||||
|  |             remove_stopwords_en=False, | ||||||
|  |             remove_urls=False | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         expected = "doesn't" | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
|  | 
 | ||||||
|  |     def test_html_invalid_attribute(self): | ||||||
|  |         text = '<root><iframe width="560" height="315" src=" " title="youtube video player" frameborder="0" allowfullscreen></iframe></root>' | ||||||
|  | 
 | ||||||
|  |         cleaned = preprocess( | ||||||
|  |             text, | ||||||
|  |             clean_html=True, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_punctuation=True, | ||||||
|  |             strip=True, | ||||||
|  |             lemmatize=False, | ||||||
|  |             fix_single_quotes=True, | ||||||
|  |             remove_stopwords_en=False, | ||||||
|  |             remove_urls=False | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         expected = "" | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
|  | 
 | ||||||
|     def test_bigrams(self): |     def test_bigrams(self): | ||||||
|         text = "x A b c d e f g h" |         text = "x A b c d e f g h" | ||||||
|         cleaned = preprocess( |         cleaned = preprocess( | ||||||
| @ -178,3 +233,14 @@ class TestText(TestCase): | |||||||
|         expected = "x a_b c_d e f_g h" |         expected = "x a_b c_d e f_g h" | ||||||
| 
 | 
 | ||||||
|         self.assertEqual(cleaned, expected) |         self.assertEqual(cleaned, expected) | ||||||
|  | 
 | ||||||
|  |     def test_remove_numbers(self): | ||||||
|  |         text = "Hello1 test1124test 12 1 1111111 world" | ||||||
|  |         cleaned = preprocess( | ||||||
|  |             text, | ||||||
|  |             lowercase=True, | ||||||
|  |             remove_numbers=True | ||||||
|  |         ) | ||||||
|  |         expected = "hello1 test1124test world" | ||||||
|  | 
 | ||||||
|  |         self.assertEqual(cleaned, expected) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user