Optional proxy, small refactor

simon 2019-11-19 19:10:09 -05:00
parent b6425713a8
commit 65712e2ca1
6 changed files with 35 additions and 41 deletions

@@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
 horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li.
+chan.org.li, hispachan, 8kun, nowere, iichan and more.
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.

Binary file not shown (image updated; before: 87 KiB, after: 366 KiB).


@@ -1,17 +1,15 @@
-import base64
 import hashlib
-import re
 import zlib
 from io import BytesIO
 from urllib.parse import urljoin

 import imagehash
 from PIL import Image
+from hexlib.imhash import b64hash
 from util import logger

-LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
-HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+from hexlib.regex import HTML_HREF_RE, LINK_RE

 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -33,12 +31,6 @@ def _is_image(url):
     return url.lower().endswith(IMAGE_FILETYPES)

-def b64hash(imhash, bcount):
-    return base64.b64encode(
-        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
-    ).decode("ascii")

 def image_meta(url, url_idx, web):
     r = web.get(url)
     if not r:
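Note that b64hash now comes from hexlib.imhash instead of being defined locally. The deleted implementation above sets bit i for every True cell in the flattened perceptual-hash matrix, packs the result into bcount big-endian bytes, and base64-encodes them. A minimal usage sketch, assuming hexlib's version keeps the same signature (the input image and byte count here are illustrative):

    import base64

    import imagehash
    from PIL import Image

    def b64hash(imhash, bcount):
        # Same as the local helper this commit deletes: one bit per cell of
        # the flattened hash matrix, packed into bcount big-endian bytes.
        return base64.b64encode(
            sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
        ).decode("ascii")

    im = Image.open("example.png")          # hypothetical input image
    print(b64hash(imagehash.phash(im), 8))  # 64-bit phash -> 8 bytes -> base64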


@@ -6,3 +6,5 @@ stem
 influxdb
 pika
 bs4
+urllib3
+git+git://github.com/simon987/hexlib.git
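Since hexlib is installed straight from git, a quick import smoke test confirms the dependency provides every symbol this commit starts using (these are exactly the imports added in the other files):

    # Should run without ImportError after `pip install -r requirements.txt`.
    from hexlib.imhash import b64hash
    from hexlib.misc import rate_limit
    from hexlib.regex import HTML_HREF_RE, LINK_RE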

run.py

@@ -19,8 +19,8 @@ BYPASS_RPS = False

 class ChanScanner:
-    def __init__(self, helper):
-        self.web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    def __init__(self, helper, proxy):
+        self.web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=proxy)
         self.helper = helper
         self.state = ChanState()
@@ -139,9 +139,9 @@ class ChanState:
         conn.commit()

-def publish_worker(queue: Queue, helper):
+def publish_worker(queue: Queue, helper, p):
     channel = connect()
-    web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=p)

     while True:
         try:
@@ -203,6 +203,12 @@ if __name__ == "__main__":
     chan = sys.argv[2]
     chan_helper = CHANS[chan]

+    proxy = None
+    if len(sys.argv) > 3:
+        proxy = sys.argv[3]
+        logger.info("Using proxy %s" % proxy)
+        BYPASS_RPS = True
+
     if BYPASS_RPS:
         chan_helper.rps = 10
@@ -212,10 +218,10 @@ if __name__ == "__main__":
     publish_q = Queue()
     for _ in range(5):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
         publish_thread.start()

-    s = ChanScanner(chan_helper)
+    s = ChanScanner(chan_helper, proxy)
     while True:
         for p, b in s.all_posts():
             publish_q.put((p, b))
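The proxy is read from an optional third command-line argument and handed to both the scanner and every publisher thread, so all HTTP traffic shares it; passing a proxy also flips BYPASS_RPS, which raises the helper's rate limit to 10 requests per second. A standalone sketch of that argument handling (the proxy URL is a hypothetical example, and the first positional argument is left unspecified as in the diff):

    import sys

    # Mirrors the new run.py entry point: argv[3], when present, is a proxy
    # URL, e.g. `python run.py <argv1> 4chan http://127.0.0.1:8080`.
    proxy = None
    BYPASS_RPS = False
    if len(sys.argv) > 3:
        proxy = sys.argv[3]
        BYPASS_RPS = True

    rps = 10 if BYPASS_RPS else 1 / 2  # Web's default rps is 1/2
    print(proxy, rps)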

util.py

@@ -1,11 +1,14 @@
 import logging
 import sys
-import time
 import traceback
 from datetime import datetime
 from logging import FileHandler, StreamHandler

 import requests
+from hexlib.misc import rate_limit
+from urllib3 import disable_warnings
+
+disable_warnings()

 last_time_called = dict()
@@ -21,35 +24,26 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))

-def rate_limit(per_second):
-    min_interval = 1.0 / float(per_second)
-
-    def decorate(func):
-        last_time_called[func] = 0
-
-        def wrapper(*args, **kwargs):
-            elapsed = time.perf_counter() - last_time_called[func]
-            wait_time = min_interval - elapsed
-            if wait_time > 0:
-                time.sleep(wait_time)
-            last_time_called[func] = time.perf_counter()
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorate

 class Web:
-    def __init__(self, monitoring, rps=1/2):
+    def __init__(self, monitoring, rps=1 / 2, proxy=None):
         self.session = requests.Session()
+        if proxy:
+            self.session.proxies = {"http": proxy, "https": proxy}
+            self.session.verify = False
         self._rps = rps
         self.monitoring = monitoring

         @rate_limit(self._rps)
         def _get(url, **kwargs):
-            return self.session.get(url, **kwargs)
+            retries = 3
+
+            while retries > 0:
+                retries -= 1
+                try:
+                    return self.session.get(url, **kwargs)
+                except Exception as e:
+                    logger.warning("Error with request %s: %s" % (url, str(e)))
+            raise Exception("Gave up request after maximum number of retries")

         self._get = _get
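Taken together, Web now routes everything through an optional proxy (with TLS verification disabled, hence the disable_warnings() call above), rate-limits via hexlib's decorator, and retries failed requests up to three times before giving up. A self-contained sketch of the resulting behavior, with hexlib.misc.rate_limit stood in by the implementation this commit deletes (its interface is assumed from the call site) and a hypothetical proxy URL:

    import time

    import requests

    def rate_limit(per_second):
        # Stand-in for hexlib.misc.rate_limit, copied from the deleted helper:
        # space calls to the wrapped function at least 1/per_second apart.
        min_interval = 1.0 / float(per_second)

        def decorate(func):
            last_called = [0.0]

            def wrapper(*args, **kwargs):
                wait_time = min_interval - (time.perf_counter() - last_called[0])
                if wait_time > 0:
                    time.sleep(wait_time)
                last_called[0] = time.perf_counter()
                return func(*args, **kwargs)

            return wrapper

        return decorate

    session = requests.Session()
    proxy = "http://127.0.0.1:8080"  # hypothetical proxy URL
    session.proxies = {"http": proxy, "https": proxy}
    session.verify = False  # proxied TLS is not verified, as in Web.__init__

    @rate_limit(1 / 2)
    def get(url, **kwargs):
        # Retry transient failures up to 3 times, as in the new Web._get.
        retries = 3
        while retries > 0:
            retries -= 1
            try:
                return session.get(url, **kwargs)
            except Exception as e:
                print("Error with request %s: %s" % (url, e))
        raise Exception("Gave up request after maximum number of retries")

    # get("https://example.com")  # successive calls are spaced >= 2 s at rps=1/2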