mirror of
				https://github.com/simon987/hexlib.git
				synced 2025-10-31 16:16:52 +00:00 
			
		
		
		
	Compare commits
	
		
			5 Commits
		
	
	
		
			8d8f9e8751
			...
			90d434ec73
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| | 90d434ec73 | | |
| | 55fd4a66d2 | | |
| | 3677815d57 | | |
| | 1ce795a759 | | |
| | e1537297d7 | | |
| @@ -56,9 +56,12 @@ def _transform_trigram(ngram_seq, ngrams): | ||||
|             yield ngram[0] | ||||
| 
 | ||||
| 
 | ||||
| SINGLE_QUOTES = ("’", "`") | ||||
| SINGLE_QUOTES = ("’", "`", "‘") | ||||
| SINGLE_QUOTE_TRANS = str.maketrans("".join(SINGLE_QUOTES), "".join(repeat("'", len(SINGLE_QUOTES)))) | ||||
| 
 | ||||
| DASHES = ("–", "⸺", "–", "—") | ||||
| DASHES_TRANS = str.maketrans("".join(DASHES), "".join(repeat("-", len(DASHES)))) | ||||
| 
 | ||||
| PUNCTUATION = ".,;:\"!?/()|*=>" | ||||
| PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) | ||||
| 
 | ||||
| @@ -72,6 +75,8 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False | ||||
|     if fix_single_quotes: | ||||
|         text = text.translate(SINGLE_QUOTE_TRANS) | ||||
| 
 | ||||
|     text = text.translate(DASHES_TRANS) | ||||
| 
 | ||||
|     if remove_urls: | ||||
|         text = LINK_RE.sub(" ", text) | ||||
| 
 | ||||
| @@ -92,7 +97,7 @@ def preprocess(text, lowercase=False, clean_html=False, remove_punctuation=False | ||||
|     words = text.split() | ||||
| 
 | ||||
|     if strip_quotes: | ||||
|         words = filter(lambda w: w.strip("\"'"), words) | ||||
|         words = map(lambda w: w.strip("\"'“”"), words) | ||||
| 
 | ||||
|     if bigrams: | ||||
|         words = _transform_bigram(nltk.bigrams(chain(words, ("*",))), bigrams) | ||||
|  | ||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @@ -2,7 +2,7 @@ from setuptools import setup | ||||
| 
 | ||||
| setup( | ||||
|     name="hexlib", | ||||
|     version="1.66", | ||||
|     version="1.70", | ||||
|     description="Misc utility methods", | ||||
|     author="simon987", | ||||
|     author_email="me@simon987.net", | ||||
|  | ||||
| @@ -247,3 +247,13 @@ class TestText(TestCase): | ||||
|         expected = "hello1 test1124test world" | ||||
| 
 | ||||
|         self.assertEqual(" ".join(cleaned), expected) | ||||
| 
 | ||||
|     def test_strip_quotes(self): | ||||
|         text = "'hi' “test” 'hello\"" | ||||
|         cleaned = preprocess( | ||||
|             text, | ||||
|             strip_quotes=True | ||||
|         ) | ||||
|         expected = "hi test hello" | ||||
| 
 | ||||
|         self.assertEqual(" ".join(cleaned), expected) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user