Optional proxy, small refactor

simon 2019-11-19 19:10:09 -05:00
parent b6425713a8
commit 65712e2ca1
6 changed files with 35 additions and 41 deletions

@@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
 horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li.
+chan.org.li, hispachan, 8kun, nowere, iichan and more.
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.

Binary file not shown (image updated; before: 87 KiB, after: 366 KiB).


@@ -1,17 +1,15 @@
-import base64
 import hashlib
-import re
 import zlib
 from io import BytesIO
 from urllib.parse import urljoin

 import imagehash
 from PIL import Image
+from hexlib.imhash import b64hash
 from util import logger

-LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
-HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+from hexlib.regex import HTML_HREF_RE, LINK_RE

 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -33,12 +31,6 @@ def _is_image(url):
     return url.lower().endswith(IMAGE_FILETYPES)

-def b64hash(imhash, bcount):
-    return base64.b64encode(
-        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
-    ).decode("ascii")

 def image_meta(url, url_idx, web):
     r = web.get(url)
     if not r:
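Note that b64hash now comes from hexlib.imhash instead of being defined locally. The deleted implementation above sets bit i for every True cell in the flattened perceptual-hash matrix, packs the result into bcount big-endian bytes, and base64-encodes them. A minimal usage sketch, assuming hexlib's version keeps the same signature (the input image and byte count here are illustrative):

    import base64

    import imagehash
    from PIL import Image

    def b64hash(imhash, bcount):
        # Same as the local helper this commit deletes: one bit per cell of
        # the flattened hash matrix, packed into bcount big-endian bytes.
        return base64.b64encode(
            sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
        ).decode("ascii")

    im = Image.open("example.png")          # hypothetical input image
    print(b64hash(imagehash.phash(im), 8))  # 64-bit phash -> 8 bytes -> base64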


@@ -6,3 +6,5 @@ stem
 influxdb
 pika
 bs4
+urllib3
+git+git://github.com/simon987/hexlib.git
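Since hexlib is installed straight from git, a quick import smoke test confirms the dependency provides every symbol this commit starts using (these are exactly the imports added in the other files):

    # Should run without ImportError after `pip install -r requirements.txt`.
    from hexlib.imhash import b64hash
    from hexlib.misc import rate_limit
    from hexlib.regex import HTML_HREF_RE, LINK_RE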

run.py

@@ -19,8 +19,8 @@ BYPASS_RPS = False

 class ChanScanner:
-    def __init__(self, helper):
-        self.web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    def __init__(self, helper, proxy):
+        self.web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=proxy)
         self.helper = helper
         self.state = ChanState()
@@ -139,9 +139,9 @@ class ChanState:
         conn.commit()

-def publish_worker(queue: Queue, helper):
+def publish_worker(queue: Queue, helper, p):
     channel = connect()
-    web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=p)

     while True:
         try:
@@ -203,6 +203,12 @@ if __name__ == "__main__":
     chan = sys.argv[2]
     chan_helper = CHANS[chan]

+    proxy = None
+    if len(sys.argv) > 3:
+        proxy = sys.argv[3]
+        logger.info("Using proxy %s" % proxy)
+        BYPASS_RPS = True
+
     if BYPASS_RPS:
         chan_helper.rps = 10
@@ -212,10 +218,10 @@ if __name__ == "__main__":
     publish_q = Queue()
     for _ in range(5):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
         publish_thread.start()

-    s = ChanScanner(chan_helper)
+    s = ChanScanner(chan_helper, proxy)
     while True:
         for p, b in s.all_posts():
             publish_q.put((p, b))
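The proxy is read from an optional third command-line argument and handed to both the scanner and every publisher thread, so all HTTP traffic shares it; passing a proxy also flips BYPASS_RPS, which raises the helper's rate limit to 10 requests per second. A standalone sketch of that argument handling (the proxy URL is a hypothetical example, and the first positional argument is left unspecified as in the diff):

    import sys

    # Mirrors the new run.py entry point: argv[3], when present, is a proxy
    # URL, e.g. `python run.py <argv1> 4chan http://127.0.0.1:8080`.
    proxy = None
    BYPASS_RPS = False
    if len(sys.argv) > 3:
        proxy = sys.argv[3]
        BYPASS_RPS = True

    rps = 10 if BYPASS_RPS else 1 / 2  # Web's default rps is 1/2
    print(proxy, rps)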

util.py

@@ -1,11 +1,14 @@
 import logging
 import sys
-import time
 import traceback
 from datetime import datetime
 from logging import FileHandler, StreamHandler

 import requests
+from hexlib.misc import rate_limit
+from urllib3 import disable_warnings
+
+disable_warnings()

 last_time_called = dict()
@@ -21,35 +24,26 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))

-def rate_limit(per_second):
-    min_interval = 1.0 / float(per_second)
-
-    def decorate(func):
-        last_time_called[func] = 0
-
-        def wrapper(*args, **kwargs):
-            elapsed = time.perf_counter() - last_time_called[func]
-            wait_time = min_interval - elapsed
-            if wait_time > 0:
-                time.sleep(wait_time)
-            last_time_called[func] = time.perf_counter()
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorate

 class Web:
-    def __init__(self, monitoring, rps=1/2):
+    def __init__(self, monitoring, rps=1 / 2, proxy=None):
         self.session = requests.Session()
+        if proxy:
+            self.session.proxies = {"http": proxy, "https": proxy}
+            self.session.verify = False
         self._rps = rps
         self.monitoring = monitoring

         @rate_limit(self._rps)
         def _get(url, **kwargs):
-            return self.session.get(url, **kwargs)
+            retries = 3
+
+            while retries > 0:
+                retries -= 1
+                try:
+                    return self.session.get(url, **kwargs)
+                except Exception as e:
+                    logger.warning("Error with request %s: %s" % (url, str(e)))
+            raise Exception("Gave up request after maximum number of retries")

         self._get = _get
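Taken together, Web now routes everything through an optional proxy (with TLS verification disabled, hence the disable_warnings() call above), rate-limits via hexlib's decorator, and retries failed requests up to three times before giving up. A self-contained sketch of the resulting behavior, with hexlib.misc.rate_limit stood in by the implementation this commit deletes (its interface is assumed from the call site) and a hypothetical proxy URL:

    import time

    import requests

    def rate_limit(per_second):
        # Stand-in for hexlib.misc.rate_limit, copied from the deleted helper:
        # space calls to the wrapped function at least 1/per_second apart.
        min_interval = 1.0 / float(per_second)

        def decorate(func):
            last_called = [0.0]

            def wrapper(*args, **kwargs):
                wait_time = min_interval - (time.perf_counter() - last_called[0])
                if wait_time > 0:
                    time.sleep(wait_time)
                last_called[0] = time.perf_counter()
                return func(*args, **kwargs)

            return wrapper

        return decorate

    session = requests.Session()
    proxy = "http://127.0.0.1:8080"  # hypothetical proxy URL
    session.proxies = {"http": proxy, "https": proxy}
    session.verify = False  # proxied TLS is not verified, as in Web.__init__

    @rate_limit(1 / 2)
    def get(url, **kwargs):
        # Retry transient failures up to 3 times, as in the new Web._get.
        retries = 3
        while retries > 0:
            retries -= 1
            try:
                return session.get(url, **kwargs)
            except Exception as e:
                print("Error with request %s: %s" % (url, e))
        raise Exception("Gave up request after maximum number of retries")

    # get("https://example.com")  # successive calls are spaced >= 2 s at rps=1/2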