Optional proxy, small refactor

simon 2019-11-19 19:10:09 -05:00
parent b6425713a8
commit 65712e2ca1
6 changed files with 35 additions and 41 deletions


@@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
 horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li.
+chan.org.li, hispachan, 8kun, nowere, iichan and more.
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.
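As context for the monitoring hook mentioned above, a minimal sketch of pushing a point to InfluxDB with the influxdb client pinned in requirements.txt; the database, measurement, and field names here are illustrative, not the project's actual schema:

```python
from influxdb import InfluxDBClient

# Hypothetical database and measurement names; the project's actual
# monitoring schema is not shown in this diff.
client = InfluxDBClient(host="localhost", port=8086, database="chan_feed")
client.write_points([{
    "measurement": "posts",
    "fields": {"count": 120},
}])
```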

Binary file not shown (image updated: 87 KiB before, 366 KiB after).


@@ -1,17 +1,15 @@
-import base64
 import hashlib
-import re
 import zlib
 from io import BytesIO
 from urllib.parse import urljoin

 import imagehash
 from PIL import Image
+from hexlib.imhash import b64hash

 from util import logger

-LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
-HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+from hexlib.regex import HTML_HREF_RE, LINK_RE

 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -33,12 +31,6 @@ def _is_image(url):
     return url.lower().endswith(IMAGE_FILETYPES)


-def b64hash(imhash, bcount):
-    return base64.b64encode(
-        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
-    ).decode("ascii")
-
-
 def image_meta(url, url_idx, web):
     r = web.get(url)
     if not r:
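The deleted b64hash helper now comes from hexlib (imported above as hexlib.imhash.b64hash). A minimal sketch of what it computes, assuming the relocated version keeps the behaviour of the removed function:

```python
import base64

import imagehash
from PIL import Image


def b64hash(imhash, bcount):
    # Pack the hash's flattened boolean matrix into an integer (bit i is
    # set when bit i of the hash is set), serialise it to bcount
    # big-endian bytes, then base64-encode to ASCII.
    return base64.b64encode(
        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
    ).decode("ascii")


im = Image.new("RGB", (64, 64), "white")  # any image works for the demo
print(b64hash(imagehash.phash(im), 8))    # 64-bit pHash fits in 8 bytes
```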


@@ -5,4 +5,6 @@ requests[socks]
 stem
 influxdb
 pika
-bs4
\ No newline at end of file
+bs4
+urllib3
+git+git://github.com/simon987/hexlib.git

run.py

@@ -19,8 +19,8 @@ BYPASS_RPS = False

 class ChanScanner:
-    def __init__(self, helper):
-        self.web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    def __init__(self, helper, proxy):
+        self.web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=proxy)
         self.helper = helper
         self.state = ChanState()
@@ -139,9 +139,9 @@ class ChanState:
             conn.commit()


-def publish_worker(queue: Queue, helper):
+def publish_worker(queue: Queue, helper, p):
     channel = connect()
-    web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=p)

     while True:
         try:
@@ -203,6 +203,12 @@ if __name__ == "__main__":
     chan = sys.argv[2]
     chan_helper = CHANS[chan]

+    proxy = None
+    if len(sys.argv) > 3:
+        proxy = sys.argv[3]
+        logger.info("Using proxy %s" % proxy)
+        BYPASS_RPS = True
+
     if BYPASS_RPS:
         chan_helper.rps = 10
@@ -212,10 +218,10 @@
     publish_q = Queue()

     for _ in range(5):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
         publish_thread.start()

-    s = ChanScanner(chan_helper)
+    s = ChanScanner(chan_helper, proxy)
     while True:
         for p, b in s.all_posts():
             publish_q.put((p, b))
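A standalone sketch of the fan-out pattern this hunk threads the proxy through: one scanner loop feeds a Queue drained by several publisher threads, each receiving the optional proxy. The worker body, proxy URL, and fake posts are illustrative only:

```python
from queue import Queue
from threading import Thread


def publish_worker(q: Queue, proxy):
    # Each worker would build its own (proxied) Web session, as in run.py.
    while True:
        post = q.get()
        print("publish via", proxy or "direct connection", "->", post)
        q.task_done()


proxy = "socks5h://localhost:9050"  # hypothetical proxy URL
publish_q = Queue()
for _ in range(5):
    Thread(target=publish_worker, args=(publish_q, proxy), daemon=True).start()

for post in ("post-1", "post-2", "post-3"):  # stand-in for s.all_posts()
    publish_q.put(post)
publish_q.join()  # wait until every queued post has been handled
```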

util.py

@@ -1,11 +1,14 @@
 import logging
 import sys
 import time
 import traceback
 from datetime import datetime
 from logging import FileHandler, StreamHandler

 import requests
+from hexlib.misc import rate_limit
+from urllib3 import disable_warnings

+disable_warnings()

 last_time_called = dict()
@@ -21,35 +24,26 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))


-def rate_limit(per_second):
-    min_interval = 1.0 / float(per_second)
-
-    def decorate(func):
-        last_time_called[func] = 0
-
-        def wrapper(*args, **kwargs):
-            elapsed = time.perf_counter() - last_time_called[func]
-            wait_time = min_interval - elapsed
-            if wait_time > 0:
-                time.sleep(wait_time)
-
-            last_time_called[func] = time.perf_counter()
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorate
-
-
 class Web:
-    def __init__(self, monitoring, rps=1/2):
+    def __init__(self, monitoring, rps=1 / 2, proxy=None):
         self.session = requests.Session()
+        if proxy:
+            self.session.proxies = {"http": proxy, "https": proxy}
+            self.session.verify = False
         self._rps = rps
         self.monitoring = monitoring

         @rate_limit(self._rps)
         def _get(url, **kwargs):
-            return self.session.get(url, **kwargs)
+            retries = 3
+
+            while retries > 0:
+                retries -= 1
+                try:
+                    return self.session.get(url, **kwargs)
+                except Exception as e:
+                    logger.warning("Error with request %s: %s" % (url, str(e)))
+            raise Exception("Gave up request after maximum number of retries")

         self._get = _get
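Two usage notes on the refactored util.py. First, the deleted decorator is now imported from hexlib.misc; assuming the relocated rate_limit keeps the per_second signature shown above, it is used like this:

```python
import time

from hexlib.misc import rate_limit


@rate_limit(2)  # allow at most two calls per second
def fetch():
    return time.perf_counter()


t0 = fetch()
t1 = fetch()
print(t1 - t0)  # roughly 0.5 s enforced between calls
```

Second, a minimal sketch of what the new proxy support does at the requests level: route both schemes through one proxy URL and disable TLS verification, which is why urllib3 warnings are silenced at import time. The proxy URL is a placeholder and a live proxy is assumed:

```python
import requests
from urllib3 import disable_warnings

disable_warnings()  # silence InsecureRequestWarning caused by verify=False

session = requests.Session()
proxy = "socks5h://localhost:9050"  # hypothetical proxy; needs requests[socks]
session.proxies = {"http": proxy, "https": proxy}
session.verify = False  # tolerate the proxy's self-signed/intercepted certs

r = session.get("https://example.com")
print(r.status_code)
```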