diff --git a/README.md b/README.md
index 866e590..a79a4d4 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan, 22chan, wizchan, 1chan, 2ch.hk,
 endchan, 38chan, alokal, horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li.
+chan.org.li, hispachan, 8kun, nowere, iichan and more.
 
 Can optionally push monitoring data to InfluxDB. Below is an example of Grafana being
 used to display it.
 
diff --git a/monitoring.png b/monitoring.png
index ae76b18..a965698 100644
Binary files a/monitoring.png and b/monitoring.png differ
diff --git a/post_process.py b/post_process.py
index 158b2c9..a00f316 100644
--- a/post_process.py
+++ b/post_process.py
@@ -1,17 +1,15 @@
-import base64
 import hashlib
-import re
 import zlib
 from io import BytesIO
 from urllib.parse import urljoin
 
 import imagehash
 from PIL import Image
+from hexlib.imhash import b64hash
 
 from util import logger
 
-LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
-HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+from hexlib.regex import HTML_HREF_RE, LINK_RE
 
 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -33,12 +31,6 @@ def _is_image(url):
     return url.lower().endswith(IMAGE_FILETYPES)
 
 
-def b64hash(imhash, bcount):
-    return base64.b64encode(
-        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
-    ).decode("ascii")
-
-
 def image_meta(url, url_idx, web):
     r = web.get(url)
     if not r:
diff --git a/requirements.txt b/requirements.txt
index c0188c8..fe9c1d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,6 @@ requests[socks]
 stem
 influxdb
 pika
-bs4
\ No newline at end of file
+bs4
+urllib3
+git+git://github.com/simon987/hexlib.git
\ No newline at end of file
diff --git a/run.py b/run.py
index 1a87491..0b188ce 100644
--- a/run.py
+++ b/run.py
@@ -19,8 +19,8 @@ BYPASS_RPS = False
 
 
 class ChanScanner:
-    def __init__(self, helper):
-        self.web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    def __init__(self, helper, proxy):
+        self.web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=proxy)
         self.helper = helper
         self.state = ChanState()
 
@@ -139,9 +139,9 @@ class ChanState:
             conn.commit()
 
 
-def publish_worker(queue: Queue, helper):
+def publish_worker(queue: Queue, helper, p):
     channel = connect()
-    web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=p)
 
     while True:
         try:
@@ -203,6 +203,12 @@ if __name__ == "__main__":
     chan = sys.argv[2]
     chan_helper = CHANS[chan]
 
+    proxy = None
+    if len(sys.argv) > 3:
+        proxy = sys.argv[3]
+        logger.info("Using proxy %s" % proxy)
+
+    BYPASS_RPS = True
     if BYPASS_RPS:
         chan_helper.rps = 10
 
@@ -212,10 +218,10 @@
 
     publish_q = Queue()
     for _ in range(5):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
         publish_thread.start()
 
-    s = ChanScanner(chan_helper)
+    s = ChanScanner(chan_helper, proxy)
     while True:
         for p, b in s.all_posts():
             publish_q.put((p, b))
diff --git a/util.py b/util.py
index cd7d30a..8bb8e7a 100644
--- a/util.py
+++ b/util.py
@@ -1,11 +1,14 @@
 import logging
 import sys
-import time
 import traceback
 from datetime import datetime
 from logging import FileHandler, StreamHandler
 
 import requests
+from hexlib.misc import rate_limit
+from urllib3 import disable_warnings
+
+disable_warnings()
 
 last_time_called = dict()
 
@@ -21,35 +24,26 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 
-def rate_limit(per_second):
-    min_interval = 1.0 / float(per_second)
-
-    def decorate(func):
-        last_time_called[func] = 0
-
-        def wrapper(*args, **kwargs):
-            elapsed = time.perf_counter() - last_time_called[func]
-            wait_time = min_interval - elapsed
-            if wait_time > 0:
-                time.sleep(wait_time)
-
-            last_time_called[func] = time.perf_counter()
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorate
-
-
 class Web:
-    def __init__(self, monitoring, rps=1/2):
+    def __init__(self, monitoring, rps=1 / 2, proxy=None):
         self.session = requests.Session()
+        if proxy:
+            self.session.proxies = {"http": proxy, "https": proxy}
+            self.session.verify = False
         self._rps = rps
         self.monitoring = monitoring
 
         @rate_limit(self._rps)
         def _get(url, **kwargs):
-            return self.session.get(url, **kwargs)
+            retries = 3
+
+            while retries > 0:
+                retries -= 1
+                try:
+                    return self.session.get(url, **kwargs)
+                except Exception as e:
+                    logger.warning("Error with request %s: %s" % (url, str(e)))
+            raise Exception("Gave up request after maximum number of retries")
 
         self._get = _get
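
Note on the proxy plumbing above: run.py reads the optional third CLI argument as a proxy URL and threads it through publish_worker and ChanScanner into Web, where it is applied to the requests.Session. A minimal usage sketch; the proxy URL below (a local SOCKS daemon) is only an assumption for illustration, and any proxy URL understood by requests[socks] should work:

    # Minimal sketch, not part of the patch: drive the patched Web class directly.
    from util import Web

    # Hypothetical proxy URL (e.g. a local Tor SOCKS daemon); monitoring disabled (None).
    web = Web(None, rps=1, proxy="socks5h://127.0.0.1:9050")

    # Rate-limited to `rps` requests per second, retried up to 3 times on error,
    # and routed through the proxy with TLS verification turned off.
    r = web.get("https://example.com")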
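
For reference, the deleted b64hash helper (now imported from hexlib.imhash) packs the bit matrix of an imagehash result into a fixed number of bytes and base64-encodes it. A minimal sketch of equivalent usage, assuming hexlib's version keeps the same (imhash, bcount) signature as the deleted local copy:

    # Minimal sketch, not part of the patch.
    import imagehash
    from PIL import Image
    from hexlib.imhash import b64hash

    im = Image.open("sample.png")  # hypothetical input image
    phash = imagehash.phash(im)    # 8x8 bit matrix = 64 bits
    print(b64hash(phash, 8))       # 64 bits -> 8 bytes -> 12 base64 characters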