From 75bf2c2d856698ec1bfa0b42b3bcc708b94bb22f Mon Sep 17 00:00:00 2001 From: simon987 <me@simon987.net> Date: Sun, 25 Apr 2021 12:10:03 -0400 Subject: [PATCH] Rename text.clean to text.preprocess, add QS util func, more debug logging --- hexlib/env.py | 2 ++ hexlib/text.py | 17 ++++++++--------- hexlib/web.py | 17 ++++++++++++++++- setup.py | 2 +- test/test_text.py | 30 +++++++++++++++--------------- test/test_web.py | 21 +++++++++++++++++++++ 6 files changed, 63 insertions(+), 26 deletions(-) create mode 100644 test/test_web.py diff --git a/hexlib/env.py b/hexlib/env.py index 2d23395..80d099c 100644 --- a/hexlib/env.py +++ b/hexlib/env.py @@ -53,6 +53,8 @@ def get_web(session=None): stdout_logger.debug("Web>cipherSuite=%s" % web._session.cipherSuite) if hasattr(web._session, "headers"): stdout_logger.debug("Web>headers=%s" % web._session.headers) + if hasattr(web._session, "cookies"): + stdout_logger.debug("Web>cookies=%s" % web._session.cookies) stdout_logger.debug("Web>rps=%s" % os.environ.get("RPS", 1)) diff --git a/hexlib/text.py b/hexlib/text.py index b781782..52c901b 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -27,7 +27,7 @@ lemmatizer = WordNetLemmatizer() def clean_multicore(texts, processes, **kwargs): pool = Pool(processes=processes) return pool.map( - func=partial(clean, **kwargs), + func=partial(preprocess, **kwargs), iterable=texts, ) @@ -42,9 +42,9 @@ def _transform_bigram(ngram_seq, ngrams): yield ngram[0] -def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None): +def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None): + remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, + remove_urls=False, bigrams: set = None): if lowercase: text = text.lower() @@ -64,20 +64,19 @@ def clean(text, lowercase=False, clean_html=False, 
strip=False, remove_punctuati if remove_punctuation: text = PUNCTUATION_RE.sub(" ", text) - if not remove_stopwords_en or not lemmatize or not strip_quotes: - text = WHITESPACE_RE.sub(" ", text) + text = WHITESPACE_RE.sub(" ", text) if strip_quotes: - words = WHITESPACE_RE.split(text) + words = text.split(" ") text = " ".join(w.strip("\"'") for w in words) if bigrams: - words = WHITESPACE_RE.split(text) + words = text.split(" ") words.append("*") text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) if remove_stopwords_en or lemmatize: - words = WHITESPACE_RE.split(text) + words = text.split(" ") if lemmatize and remove_stopwords_en: text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) diff --git a/hexlib/web.py b/hexlib/web.py index ee0e95f..1fa5b5c 100644 --- a/hexlib/web.py +++ b/hexlib/web.py @@ -5,6 +5,8 @@ from datetime import datetime from base64 import b64encode, b64decode from http.cookiejar import Cookie from time import time +from urllib.parse import urlparse, parse_qs + from bs4 import BeautifulSoup import requests @@ -79,6 +81,18 @@ def cookiejar_filter(cj, pattern): return filtered_cj +def url_query_value(url, arg, as_list=False): + qs = urlparse(url).query + parsed_qs = parse_qs(qs) + + arg = parsed_qs.get(arg, []) + + if as_list: + return arg if arg else [] + else: + return arg[0] if arg else None + + def download_file(url, destination, session=None, headers=None, overwrite=False, retries=1, err_cb=None, save_meta=False): if os.path.exists(destination) and not overwrite: @@ -112,7 +126,8 @@ def download_file(url, destination, session=None, headers=None, overwrite=False, class Web: - def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, session=None, + def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, + session=None, ua=None): self._cookie_file = cookie_file self._proxy = proxy diff --git 
a/setup.py b/setup.py index 36f655e..620a988 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.41", + version="1.42", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 334f456..441e84b 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -1,13 +1,13 @@ from unittest import TestCase -from hexlib.text import clean +from hexlib.text import preprocess class TestText(TestCase): def test_html_invalid(self): text = "" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, ) @@ -17,7 +17,7 @@ class TestText(TestCase): def test_html_1(self): text = "<div>Hello, world</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, ) @@ -27,7 +27,7 @@ class TestText(TestCase): def test_html_2(self): text = "<div>Hello, world</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True ) @@ -38,7 +38,7 @@ class TestText(TestCase): def test_html_3(self): text = "<div>\n Hello, \t\n world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -49,7 +49,7 @@ class TestText(TestCase): def test_html_4(self): text = "<div>\n Hello, \t\n world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -61,7 +61,7 @@ class TestText(TestCase): def test_html_5(self): text = "<div>\n Hello, \t\n world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -74,7 +74,7 @@ class TestText(TestCase): def test_html_6(self): text = "<div>\n Hello, \t\na the world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -88,7 +88,7 @@ class TestText(TestCase): def test_html_7(self): text = "<div>\n Hello, \t\na the worlds \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -103,7 +103,7 @@ class TestText(TestCase): def test_html_8(self): text = "<div>\n Hello, \t\na the worlds! \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -118,7 +118,7 @@ class TestText(TestCase): def test_html_9(self): text = "<div>\n Hello, \t\nworld! it's it`s \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -133,7 +133,7 @@ class TestText(TestCase): def test_html_10(self): text = "<div>\n Hello, \t\nworld! it's it`s https://google.ca/test/abc.pdf \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -148,8 +148,8 @@ class TestText(TestCase): self.assertEqual(cleaned, expected) def test_html_11(self): - text = "<div>\n Hello, \t\nworld! it's it`s u us & | \n\t</div>" - cleaned = clean( + text = "<div>\n Hello, \t\nworld! it's it`s & | \n\t</div>" + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -166,7 +166,7 @@ class TestText(TestCase): def test_bigrams(self): text = "x A b c d e f g h" - cleaned = clean( + cleaned = preprocess( text, lowercase=True, bigrams={ diff --git a/test/test_web.py b/test/test_web.py new file mode 100644 index 0000000..3ab0347 --- /dev/null +++ b/test/test_web.py @@ -0,0 +1,21 @@ +from unittest import TestCase + +from hexlib.web import url_query_value + + +class TestWebMiscFuncs(TestCase): + def test_qs_1(self): + url = "https://test.com/page?a=1&b=2&a=2&c=hello" + + self.assertEqual(url_query_value(url, "a"), "1") + self.assertEqual(url_query_value(url, "b"), "2") + self.assertEqual(url_query_value(url, "c"), "hello") + self.assertEqual(url_query_value(url, "D"), None) + + def test_qs_as_list(self): + url = "https://test.com/page?a=1&b=2&a=2&c=hello" + + self.assertEqual(url_query_value(url, "a", as_list=True), ["1", "2"]) + self.assertEqual(url_query_value(url, "b", as_list=True), ["2"]) + self.assertEqual(url_query_value(url, "c", as_list=True), ["hello"]) + self.assertEqual(url_query_value(url, "D", as_list=True), [])