From 75bf2c2d856698ec1bfa0b42b3bcc708b94bb22f Mon Sep 17 00:00:00 2001 From: simon987 <me@simon987.net> Date: Sun, 25 Apr 2021 12:10:03 -0400 Subject: [PATCH] Rename text.clean to text.preprocess, add QS util func, more debug logging --- hexlib/env.py | 2 ++ hexlib/text.py | 17 ++++++++--------- hexlib/web.py | 17 ++++++++++++++++- setup.py | 2 +- test/test_text.py | 30 +++++++++++++++--------------- test/test_web.py | 21 +++++++++++++++++++++ 6 files changed, 63 insertions(+), 26 deletions(-) create mode 100644 test/test_web.py diff --git a/hexlib/env.py b/hexlib/env.py index 2d23395..80d099c 100644 --- a/hexlib/env.py +++ b/hexlib/env.py @@ -53,6 +53,8 @@ def get_web(session=None): stdout_logger.debug("Web>cipherSuite=%s" % web._session.cipherSuite) if hasattr(web._session, "headers"): stdout_logger.debug("Web>headers=%s" % web._session.headers) + if hasattr(web._session, "cookies"): + stdout_logger.debug("Web>cookies=%s" % web._session.cookies) stdout_logger.debug("Web>rps=%s" % os.environ.get("RPS", 1)) diff --git a/hexlib/text.py b/hexlib/text.py index b781782..52c901b 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -27,7 +27,7 @@ lemmatizer = WordNetLemmatizer() def clean_multicore(texts, processes, **kwargs): pool = Pool(processes=processes) return pool.map( - func=partial(clean, **kwargs), + func=partial(preprocess, **kwargs), iterable=texts, ) @@ -42,9 +42,9 @@ def _transform_bigram(ngram_seq, ngrams): yield ngram[0] -def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None): +def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None): + remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, + remove_urls=False, bigrams: set = None): if lowercase: text = text.lower() @@ -64,20 +64,19 @@ def clean(text, lowercase=False, clean_html=False, 
strip=False, remove_punctuati if remove_punctuation: text = PUNCTUATION_RE.sub(" ", text) - if not remove_stopwords_en or not lemmatize or not strip_quotes: - text = WHITESPACE_RE.sub(" ", text) + text = WHITESPACE_RE.sub(" ", text) if strip_quotes: - words = WHITESPACE_RE.split(text) + words = text.split(" ") text = " ".join(w.strip("\"'") for w in words) if bigrams: - words = WHITESPACE_RE.split(text) + words = text.split(" ") words.append("*") text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) if remove_stopwords_en or lemmatize: - words = WHITESPACE_RE.split(text) + words = text.split(" ") if lemmatize and remove_stopwords_en: text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) diff --git a/hexlib/web.py b/hexlib/web.py index ee0e95f..1fa5b5c 100644 --- a/hexlib/web.py +++ b/hexlib/web.py @@ -5,6 +5,8 @@ from datetime import datetime from base64 import b64encode, b64decode from http.cookiejar import Cookie from time import time +from urllib.parse import urlparse, parse_qs + from bs4 import BeautifulSoup import requests @@ -79,6 +81,18 @@ def cookiejar_filter(cj, pattern): return filtered_cj +def url_query_value(url, arg, as_list=False): + qs = urlparse(url).query + parsed_qs = parse_qs(qs) + + arg = parsed_qs.get(arg, []) + + if as_list: + return arg if arg else [] + else: + return arg[0] if arg else None + + def download_file(url, destination, session=None, headers=None, overwrite=False, retries=1, err_cb=None, save_meta=False): if os.path.exists(destination) and not overwrite: @@ -112,7 +126,8 @@ def download_file(url, destination, session=None, headers=None, overwrite=False, class Web: - def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, session=None, + def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, + session=None, ua=None): self._cookie_file = cookie_file self._proxy = proxy diff --git 
a/setup.py b/setup.py index 36f655e..620a988 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.41", + version="1.42", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 334f456..441e84b 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -1,13 +1,13 @@ from unittest import TestCase -from hexlib.text import clean +from hexlib.text import preprocess class TestText(TestCase): def test_html_invalid(self): text = "" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, ) @@ -17,7 +17,7 @@ class TestText(TestCase): def test_html_1(self): text = "<div>Hello, world</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, ) @@ -27,7 +27,7 @@ class TestText(TestCase): def test_html_2(self): text = "<div>Hello, world</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True ) @@ -38,7 +38,7 @@ class TestText(TestCase): def test_html_3(self): text = "<div>\n Hello, \t\n world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -49,7 +49,7 @@ class TestText(TestCase): def test_html_4(self): text = "<div>\n Hello, \t\n world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -61,7 +61,7 @@ class TestText(TestCase): def test_html_5(self): text = "<div>\n Hello, \t\n world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -74,7 +74,7 @@ class TestText(TestCase): def test_html_6(self): text = "<div>\n Hello, \t\na the world \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -88,7 +88,7 @@ class TestText(TestCase): def test_html_7(self): text = "<div>\n Hello, \t\na the worlds \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -103,7 +103,7 @@ class TestText(TestCase): def test_html_8(self): text = "<div>\n Hello, \t\na the worlds! \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -118,7 +118,7 @@ class TestText(TestCase): def test_html_9(self): text = "<div>\n Hello, \t\nworld! it's it`s \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -133,7 +133,7 @@ class TestText(TestCase): def test_html_10(self): text = "<div>\n Hello, \t\nworld! it's it`s https://google.ca/test/abc.pdf \n\t</div>" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -148,8 +148,8 @@ class TestText(TestCase): self.assertEqual(cleaned, expected) def test_html_11(self): - text = "<div>\n Hello, \t\nworld! it's it`s u us & | \n\t</div>" - cleaned = clean( + text = "<div>\n Hello, \t\nworld! it's it`s & | \n\t</div>" + cleaned = preprocess( text, clean_html=True, lowercase=True, @@ -166,7 +166,7 @@ class TestText(TestCase): def test_bigrams(self): text = "x A b c d e f g h" - cleaned = clean( + cleaned = preprocess( text, lowercase=True, bigrams={ diff --git a/test/test_web.py b/test/test_web.py new file mode 100644 index 0000000..3ab0347 --- /dev/null +++ b/test/test_web.py @@ -0,0 +1,21 @@ +from unittest import TestCase + +from hexlib.web import url_query_value + + +class TestWebMiscFuncs(TestCase): + def test_qs_1(self): + url = "https://test.com/page?a=1&b=2&a=2&c=hello" + + self.assertEqual(url_query_value(url, "a"), "1") + self.assertEqual(url_query_value(url, "b"), "2") + self.assertEqual(url_query_value(url, "c"), "hello") + self.assertEqual(url_query_value(url, "D"), None) + + def test_qs_as_list(self): + url = "https://test.com/page?a=1&b=2&a=2&c=hello" + + self.assertEqual(url_query_value(url, "a", as_list=True), ["1", "2"]) + self.assertEqual(url_query_value(url, "b", as_list=True), ["2"]) + self.assertEqual(url_query_value(url, "c", as_list=True), ["hello"]) + self.assertEqual(url_query_value(url, "D", as_list=True), [])