diff --git a/hexlib/env.py b/hexlib/env.py index 2d23395..80d099c 100644 --- a/hexlib/env.py +++ b/hexlib/env.py @@ -53,6 +53,8 @@ def get_web(session=None): stdout_logger.debug("Web>cipherSuite=%s" % web._session.cipherSuite) if hasattr(web._session, "headers"): stdout_logger.debug("Web>headers=%s" % web._session.headers) + if hasattr(web._session, "cookies"): + stdout_logger.debug("Web>cookies=%s" % web._session.cookies) stdout_logger.debug("Web>rps=%s" % os.environ.get("RPS", 1)) diff --git a/hexlib/text.py b/hexlib/text.py index b781782..52c901b 100644 --- a/hexlib/text.py +++ b/hexlib/text.py @@ -27,7 +27,7 @@ lemmatizer = WordNetLemmatizer() def clean_multicore(texts, processes, **kwargs): pool = Pool(processes=processes) return pool.map( - func=partial(clean, **kwargs), + func=partial(preprocess, **kwargs), iterable=texts, ) @@ -42,9 +42,9 @@ def _transform_bigram(ngram_seq, ngrams): yield ngram[0] -def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, - remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, - remove_urls=False, bigrams: set = None): +def preprocess(text, lowercase=False, clean_html=False, strip=False, remove_punctuation=False, + remove_stopwords_en=False, lemmatize=False, fix_single_quotes=False, strip_quotes=False, + remove_urls=False, bigrams: set = None): if lowercase: text = text.lower() @@ -64,20 +64,19 @@ def clean(text, lowercase=False, clean_html=False, strip=False, remove_punctuati if remove_punctuation: text = PUNCTUATION_RE.sub(" ", text) - if not remove_stopwords_en or not lemmatize or not strip_quotes: - text = WHITESPACE_RE.sub(" ", text) + text = WHITESPACE_RE.sub(" ", text) if strip_quotes: - words = WHITESPACE_RE.split(text) + words = text.split(" ") text = " ".join(w.strip("\"'") for w in words) if bigrams: - words = WHITESPACE_RE.split(text) + words = text.split(" ") words.append("*") text = " ".join(_transform_bigram(nltk.bigrams(words), bigrams)) if remove_stopwords_en or lemmatize: - words = WHITESPACE_RE.split(text) + words = text.split(" ") if lemmatize and remove_stopwords_en: text = " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words_en) diff --git a/hexlib/web.py b/hexlib/web.py index ee0e95f..1fa5b5c 100644 --- a/hexlib/web.py +++ b/hexlib/web.py @@ -5,6 +5,8 @@ from datetime import datetime from base64 import b64encode, b64decode from http.cookiejar import Cookie from time import time +from urllib.parse import urlparse, parse_qs + from bs4 import BeautifulSoup import requests @@ -79,6 +81,18 @@ def cookiejar_filter(cj, pattern): return filtered_cj +def url_query_value(url, arg, as_list=False): + qs = urlparse(url).query + parsed_qs = parse_qs(qs) + + arg = parsed_qs.get(arg, []) + + if as_list: + return arg if arg else [] + else: + return arg[0] if arg else None + + def download_file(url, destination, session=None, headers=None, overwrite=False, retries=1, err_cb=None, save_meta=False): if os.path.exists(destination) and not overwrite: @@ -112,7 +126,8 @@ def download_file(url, destination, session=None, headers=None, overwrite=False, class Web: - def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, session=None, + def __init__(self, proxy=None, rps=1, retries=3, retry_sleep=0, logger=None, cookie_file=None, retry_codes=None, + session=None, ua=None): self._cookie_file = cookie_file self._proxy = proxy diff --git a/setup.py b/setup.py index 36f655e..620a988 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup setup( name="hexlib", - version="1.41", + version="1.42", description="Misc utility methods", author="simon987", author_email="me@simon987.net", diff --git a/test/test_text.py b/test/test_text.py index 334f456..441e84b 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -1,13 +1,13 @@ from unittest import TestCase -from hexlib.text import clean +from hexlib.text import preprocess class TestText(TestCase): def test_html_invalid(self): text = "" - cleaned = clean( + cleaned = preprocess( text, clean_html=True, ) @@ -17,7 +17,7 @@ class TestText(TestCase): def test_html_1(self): text = "