mirror of
https://github.com/simon987/hexlib.git
synced 2025-04-10 06:06:41 +00:00
Rename text.clean to text.preprocess, add QS util func, more debug logging
This commit is contained in:
parent
9002ae7506
commit
75bf2c2d85
@ -53,6 +53,8 @@ def get_web(session=None):
|
|||||||
stdout_logger.debug("Web>cipherSuite=%s" % web._session.cipherSuite)
|
stdout_logger.debug("Web>cipherSuite=%s" % web._session.cipherSuite)
|
||||||
if hasattr(web._session, "headers"):
|
if hasattr(web._session, "headers"):
|
||||||
stdout_logger.debug("Web>headers=%s" % web._session.headers)
|
stdout_logger.debug("Web>headers=%s" % web._session.headers)
|
||||||
|
if hasattr(web._session, "cookies"):
|
||||||
|
stdout_logger.debug("Web>cookies=%s" % web._session.cookies)
|
||||||
|
|
||||||
stdout_logger.debug("Web>rps=%s" % os.environ.get("RPS", 1))
|
stdout_logger.debug("Web>rps=%s" % os.environ.get("RPS", 1))
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ lemmatizer = WordNetLemmatizer()
|
|||||||
def clean_multicore(texts, processes, **kwargs):
|
def clean_multicore(texts, processes, **kwargs):
|
||||||
pool = Pool(processes=processes)
|
pool = Pool(processes=processes)
|
||||||
return pool.map(
|
return pool.map(
|
||||||
func=partial(clean, **kwargs),
|
func=partial(preprocess, **kwargs),
|
||||||
iterable=texts,
|
iterable=texts,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -42,9 +42,9 @@ def _transform_bigram(ngram_seq, ngrams):
|
|||||||
yield ngram[0]
|
yield ngram[0]
|
||||||
|
|
||||||
|
|
||||||
def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
|
def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
|
||||||
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
|
||||||
remove_urls=False, bigrams: set = None):
|
remove_urls=False, bigrams: set = None):
|
||||||
if lowercase:
|
if lowercase:
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
@ -64,20 +64,19 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
|
|||||||
if remove_punctuation:
|
if remove_punctuation:
|
||||||
text = PUNCTUATION_RE.sub(" ", text)
|
text = PUNCTUATION_RE.sub(" ", text)
|
||||||
|
|
||||||
if not remove_stopwords_en or not lemmatize or not strip_quotes:
|
text = WHITESPACE_RE.sub(" ", text)
|
||||||
text = WHITESPACE_RE.sub(" ", text)
|
|
||||||
|
|
||||||
if strip_quotes:
|
if strip_quotes:
|
||||||
words = WHITESPACE_RE.split(text)
|
words = text.split(" ")
|
||||||
text = " ".join(w.strip("\"'") for w in words)
|
text = " ".join(w.strip("\"'") for w in words)
|
||||||
|
|
||||||
if bigrams:
|
if bigrams:
|
||||||
words = WHITESPACE_RE.split(text)
|
words = text.split(" ")
|
||||||
words.append("*")
|
words.append("*")
|
||||||
text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
|
text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))
|
||||||
|
|
||||||
if remove_stopwords_en or lemmatize:
|
if remove_stopwords_en or lemmatize:
|
||||||
words = WHITESPACE_RE.split(text)
|
words = text.split(" ")
|
||||||
|
|
||||||
if lemmatize and remove_stopwords_en:
|
if lemmatize and remove_stopwords_en:
|
||||||
text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
|
text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)
|
||||||
|
@ -5,6 +5,8 @@ from datetime import datetime
|
|||||||
from base64 import b64encode, b64decode
|
from base64 import b64encode, b64decode
|
||||||
from http.cookiejar import Cookie
|
from http.cookiejar import Cookie
|
||||||
from time import time
|
from time import time
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -79,6 +81,18 @@ def cookiejar_filter(cj, pattern):
|
|||||||
return filtered_cj
|
return filtered_cj
|
||||||
|
|
||||||
|
|
||||||
|
def url_query_value(url, arg, as_list=False, keep_blank_values=False):
    """Return the value(s) of the query-string parameter ``arg`` in ``url``.

    :param url: URL whose query component is parsed.
    :param arg: Name of the query parameter to look up.
    :param as_list: When true, return every value of ``arg`` in order of
        appearance (``[]`` if the parameter is absent). Otherwise return only
        the first value (``None`` if the parameter is absent).
    :param keep_blank_values: Forwarded to ``urllib.parse.parse_qs``. With the
        default (``False``) a parameter with an empty value, such as ``?a=``,
        is dropped by the parser and therefore reported as absent.
    :return: A list of strings when ``as_list`` is true, else a string or None.
    """
    parsed_qs = parse_qs(urlparse(url).query, keep_blank_values=keep_blank_values)

    # Use a separate name instead of rebinding the ``arg`` parameter.
    values = parsed_qs.get(arg, [])

    if as_list:
        return values
    return values[0] if values else None
|
||||||
|
|
||||||
|
|
||||||
def download_file(url, destination, session=None, headers=None, overwrite=False, retries=1, err_cb=None,
|
def download_file(url, destination, session=None, headers=None, overwrite=False, retries=1, err_cb=None,
|
||||||
save_meta=False):
|
save_meta=False):
|
||||||
if os.path.exists(destination) and not overwrite:
|
if os.path.exists(destination) and not overwrite:
|
||||||
@ -112,7 +126,8 @@ def download_file(url, destination, session=None, headers=None, overwrite=False,
|
|||||||
|
|
||||||
|
|
||||||
class Web:
|
class Web:
|
||||||
def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, session=None,
|
def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None,
|
||||||
|
session=None,
|
||||||
ua=None):
|
ua=None):
|
||||||
self._cookie_file = cookie_file
|
self._cookie_file = cookie_file
|
||||||
self._proxy = proxy
|
self._proxy = proxy
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="hexlib",
|
name="hexlib",
|
||||||
version="1.41",
|
version="1.42",
|
||||||
description="Misc utility methods",
|
description="Misc utility methods",
|
||||||
author="simon987",
|
author="simon987",
|
||||||
author_email="me@simon987.net",
|
author_email="me@simon987.net",
|
||||||
|
@ -1,13 +1,13 @@
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
|
|
||||||
from hexlib.text import clean
|
from hexlib.text import preprocess
|
||||||
|
|
||||||
|
|
||||||
class TestText(TestCase):
|
class TestText(TestCase):
|
||||||
|
|
||||||
def test_html_invalid(self):
|
def test_html_invalid(self):
|
||||||
text = ""
|
text = ""
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
)
|
)
|
||||||
@ -17,7 +17,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_1(self):
|
def test_html_1(self):
|
||||||
text = "<div>Hello, <strong>world</strong></div>"
|
text = "<div>Hello, <strong>world</strong></div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
)
|
)
|
||||||
@ -27,7 +27,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_2(self):
|
def test_html_2(self):
|
||||||
text = "<div>Hello, <strong>world</strong></div>"
|
text = "<div>Hello, <strong>world</strong></div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True
|
lowercase=True
|
||||||
@ -38,7 +38,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_3(self):
|
def test_html_3(self):
|
||||||
text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -49,7 +49,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_4(self):
|
def test_html_4(self):
|
||||||
text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -61,7 +61,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_5(self):
|
def test_html_5(self):
|
||||||
text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -74,7 +74,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_6(self):
|
def test_html_6(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>a the world </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>a the world </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -88,7 +88,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_7(self):
|
def test_html_7(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>a the worlds </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>a the worlds </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -103,7 +103,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_8(self):
|
def test_html_8(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>a the worlds! </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>a the worlds! </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -118,7 +118,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_9(self):
|
def test_html_9(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>world! it's it`s </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>world! it's it`s </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -133,7 +133,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_html_10(self):
|
def test_html_10(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -148,8 +148,8 @@ class TestText(TestCase):
|
|||||||
self.assertEqual(cleaned, expected)
|
self.assertEqual(cleaned, expected)
|
||||||
|
|
||||||
def test_html_11(self):
|
def test_html_11(self):
|
||||||
text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
|
text = "<div>\n Hello, \t\n<strong>world! it's it`s & | </strong>\n\t</div>"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
clean_html=True,
|
clean_html=True,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
@ -166,7 +166,7 @@ class TestText(TestCase):
|
|||||||
|
|
||||||
def test_bigrams(self):
|
def test_bigrams(self):
|
||||||
text = "x A b c d e f g h"
|
text = "x A b c d e f g h"
|
||||||
cleaned = clean(
|
cleaned = preprocess(
|
||||||
text,
|
text,
|
||||||
lowercase=True,
|
lowercase=True,
|
||||||
bigrams={
|
bigrams={
|
||||||
|
21
test/test_web.py
Normal file
21
test/test_web.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
from unittest import TestCase
|
||||||
|
|
||||||
|
from hexlib.web import url_query_value
|
||||||
|
|
||||||
|
|
||||||
|
class TestWebMiscFuncs(TestCase):
    """Checks url_query_value against a URL that repeats the ``a`` parameter."""

    def test_qs_1(self):
        # Without as_list, a single value (or None when missing) is expected.
        url = "https://test.com/page?a=1&b=2&a=2&c=hello"
        expectations = (
            ("a", "1"),
            ("b", "2"),
            ("c", "hello"),
            ("D", None),
        )

        for name, expected in expectations:
            self.assertEqual(url_query_value(url, name), expected)

    def test_qs_as_list(self):
        # With as_list=True, every value is expected, or [] when missing.
        url = "https://test.com/page?a=1&b=2&a=2&c=hello"
        expectations = (
            ("a", ["1", "2"]),
            ("b", ["2"]),
            ("c", ["hello"]),
            ("D", []),
        )

        for name, expected in expectations:
            self.assertEqual(url_query_value(url, name, as_list=True), expected)
|
Loading…
x
Reference in New Issue
Block a user