Rename text.clean to text.preprocess, add QS util func, more debug logging

simon987 2021-04-25 12:10:03 -04:00
parent 9002ae7506
commit 75bf2c2d85
6 changed files with 63 additions and 26 deletions

View File

@@ -53,6 +53,8 @@ def get_web(session=None):
         stdout_logger.debug("Web>cipherSuite=%s" % web._session.cipherSuite)
     if hasattr(web._session, "headers"):
         stdout_logger.debug("Web>headers=%s" % web._session.headers)
+    if hasattr(web._session, "cookies"):
+        stdout_logger.debug("Web>cookies=%s" % web._session.cookies)
     stdout_logger.debug("Web>rps=%s" % os.environ.get("RPS", 1))

View File

@@ -27,7 +27,7 @@ lemmatizer = WordNetLemmatizer()
 def clean_multicore(texts, processes, **kwargs):
     pool = Pool(processes=processes)
     return pool.map(
-        func=partial(clean, **kwargs),
+        func=partial(preprocess, **kwargs),
         iterable=texts,
     )
@@ -42,9 +42,9 @@ def _transform_bigram(ngram_seq, ngrams):
             yield ngram[0]

-def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
+def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False,
           remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False,
           remove_urls=False, bigrams: set = None):
     if lowercase:
         text = text.lower()
@@ -64,20 +64,19 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati
     if remove_punctuation:
         text = PUNCTUATION_RE.sub(" ", text)

-    if not remove_stopwords_en or not lemmatize or not strip_quotes:
-        text = WHITESPACE_RE.sub(" ", text)
+    text = WHITESPACE_RE.sub(" ", text)

     if strip_quotes:
-        words = WHITESPACE_RE.split(text)
+        words = text.split(" ")
         text = " ".join(w.strip("\"'") for w in words)

     if bigrams:
-        words = WHITESPACE_RE.split(text)
+        words = text.split(" ")
         words.append("*")
         text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams))

     if remove_stopwords_en or lemmatize:
-        words = WHITESPACE_RE.split(text)
+        words = text.split(" ")

         if lemmatize and remove_stopwords_en:
             text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en)

View File

@@ -5,6 +5,8 @@ from datetime import datetime
 from base64 import b64encode, b64decode
 from http.cookiejar import Cookie
 from time import time
+from urllib.parse import urlparse, parse_qs

 from bs4 import BeautifulSoup
 import requests
@@ -79,6 +81,18 @@ def cookiejar_filter(cj, pattern):
     return filtered_cj


+def url_query_value(url, arg, as_list=False):
+    qs = urlparse(url).query
+    parsed_qs = parse_qs(qs)
+    arg = parsed_qs.get(arg, [])
+
+    if as_list:
+        return arg if arg else []
+    else:
+        return arg[0] if arg else None
+
+
 def download_file(url, destination, session=None, headers=None, overwrite=False, retries=1, err_cb=None,
                   save_meta=False):
     if os.path.exists(destination) and not overwrite:
@@ -112,7 +126,8 @@ def download_file(url, destination, session=None, headers=None, overwrite=False,
 class Web:
-    def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, session=None,
+    def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None,
+                 session=None,
                  ua=None):
         self._cookie_file = cookie_file
         self._proxy = proxy
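A quick sketch of how the new query-string helper behaves, following the parse_qs logic added above; the URL is made up, and the same cases are exercised by the new tests further down.

from hexlib.web import url_query_value

# made-up URL with a repeated parameter to show both return modes
url = "https://example.com/page?a=1&a=2&b=x"

url_query_value(url, "a")                 # "1"  (first value only)
url_query_value(url, "a", as_list=True)   # ["1", "2"]
url_query_value(url, "missing")           # None ([] with as_list=True)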

View File

@@ -2,7 +2,7 @@ from setuptools import setup

 setup(
     name="hexlib",
-    version="1.41",
+    version="1.42",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",

View File

@@ -1,13 +1,13 @@
 from unittest import TestCase

-from hexlib.text import clean
+from hexlib.text import preprocess


 class TestText(TestCase):
     def test_html_invalid(self):
         text = ""

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
         )
@@ -17,7 +17,7 @@ class TestText(TestCase):
     def test_html_1(self):
         text = "<div>Hello, <strong>world</strong></div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
         )
@@ -27,7 +27,7 @@ class TestText(TestCase):
     def test_html_2(self):
         text = "<div>Hello, <strong>world</strong></div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True
@@ -38,7 +38,7 @@ class TestText(TestCase):
     def test_html_3(self):
         text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -49,7 +49,7 @@ class TestText(TestCase):
     def test_html_4(self):
         text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -61,7 +61,7 @@ class TestText(TestCase):
    def test_html_5(self):
         text = "<div>\n Hello, \t\n<strong> world </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -74,7 +74,7 @@ class TestText(TestCase):
     def test_html_6(self):
         text = "<div>\n Hello, \t\n<strong>a the world </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -88,7 +88,7 @@ class TestText(TestCase):
     def test_html_7(self):
         text = "<div>\n Hello, \t\n<strong>a the worlds </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -103,7 +103,7 @@ class TestText(TestCase):
     def test_html_8(self):
         text = "<div>\n Hello, \t\n<strong>a the worlds! </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -118,7 +118,7 @@ class TestText(TestCase):
     def test_html_9(self):
         text = "<div>\n Hello, \t\n<strong>world! it's it`s </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -133,7 +133,7 @@ class TestText(TestCase):
     def test_html_10(self):
         text = "<div>\n Hello, \t\n<strong>world! it's it`s https://google.ca/test/abc.pdf </strong>\n\t</div>"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -148,8 +148,8 @@
         self.assertEqual(cleaned, expected)

     def test_html_11(self):
-        text = "<div>\n Hello, \t\n<strong>world! it's it`s u us & | </strong>\n\t</div>"
+        text = "<div>\n Hello, \t\n<strong>world! it's it`s & | </strong>\n\t</div>"
-        cleaned = clean(
+        cleaned = preprocess(
             text,
             clean_html=True,
             lowercase=True,
@@ -166,7 +166,7 @@ class TestText(TestCase):
     def test_bigrams(self):
         text = "x A b c d e f g h"

-        cleaned = clean(
+        cleaned = preprocess(
             text,
             lowercase=True,
             bigrams={

test/test_web.py Normal file
View File

@@ -0,0 +1,21 @@
+from unittest import TestCase
+
+from hexlib.web import url_query_value
+
+
+class TestWebMiscFuncs(TestCase):
+    def test_qs_1(self):
+        url = "https://test.com/page?a=1&b=2&a=2&c=hello"
+
+        self.assertEqual(url_query_value(url, "a"), "1")
+        self.assertEqual(url_query_value(url, "b"), "2")
+        self.assertEqual(url_query_value(url, "c"), "hello")
+        self.assertEqual(url_query_value(url, "D"), None)
+
+    def test_qs_as_list(self):
+        url = "https://test.com/page?a=1&b=2&a=2&c=hello"
+
+        self.assertEqual(url_query_value(url, "a", as_list=True), ["1", "2"])
+        self.assertEqual(url_query_value(url, "b", as_list=True), ["2"])
+        self.assertEqual(url_query_value(url, "c", as_list=True), ["hello"])
+        self.assertEqual(url_query_value(url, "D", as_list=True), [])