Mirror of https://github.com/simon987/chan_feed.git (synced 2025-04-10 14:06:42 +00:00)
Optional proxy, small refactor

commit 65712e2ca1 (parent b6425713a8)
@@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
 horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li.
+chan.org.li, hispachan, 8kun, nowere, iichan and more.
 
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.
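For orientation, "publishes serialised JSON to RabbitMQ" can be as small as the pika sketch below (pika is already a dependency); the exchange name, routing key and payload are illustrative placeholders, not values taken from this repository:

import json

import pika

# Hypothetical exchange/routing key; chan_feed's real names are not shown in this diff.
conn = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
channel = conn.channel()
channel.exchange_declare(exchange="chan", exchange_type="topic")
channel.basic_publish(
    exchange="chan",
    routing_key="4chan.post",
    body=json.dumps({"board": "g", "com": "..."}),
)
conn.close()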
BIN monitoring.png (binary file not shown; before: 87 KiB, after: 366 KiB)
@@ -1,17 +1,15 @@
-import base64
 import hashlib
-import re
 import zlib
 from io import BytesIO
 from urllib.parse import urljoin
 
 import imagehash
 from PIL import Image
+from hexlib.imhash import b64hash
 
 from util import logger
 
-LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
-HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+from hexlib.regex import HTML_HREF_RE, LINK_RE
 
 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -33,12 +31,6 @@ def _is_image(url):
     return url.lower().endswith(IMAGE_FILETYPES)
 
 
-def b64hash(imhash, bcount):
-    return base64.b64encode(
-        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
-    ).decode("ascii")
-
-
 def image_meta(url, url_idx, web):
     r = web.get(url)
     if not r:
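The helper deleted here (now imported from hexlib.imhash) packs the boolean bit matrix of an imagehash result into a single integer, serialises it big-endian into bcount bytes, and base64-encodes the result. A usage sketch, assuming hexlib's version keeps the (imhash, bcount) signature of the deleted local one; the default phash is 8x8, so its 64 bits fit in bcount=8:

import imagehash
from PIL import Image
from hexlib.imhash import b64hash

# 64 hash bits -> 8 bytes -> a 12-character base64 digest
print(b64hash(imagehash.phash(Image.open("some_image.png")), 8))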
@@ -5,4 +5,6 @@ requests[socks]
 stem
 influxdb
 pika
 bs4
+urllib3
+git+git://github.com/simon987/hexlib.git
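A note on the new last line: the unauthenticated git:// protocol has since been disabled on GitHub, so a present-day pin of the same dependency would read:

git+https://github.com/simon987/hexlib.git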
run.py (18 lines changed)
@@ -19,8 +19,8 @@ BYPASS_RPS = False
 
 
 class ChanScanner:
-    def __init__(self, helper):
-        self.web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    def __init__(self, helper, proxy):
+        self.web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=proxy)
         self.helper = helper
         self.state = ChanState()
 
@@ -139,9 +139,9 @@ class ChanState:
         conn.commit()
 
 
-def publish_worker(queue: Queue, helper):
+def publish_worker(queue: Queue, helper, p):
     channel = connect()
-    web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=p)
 
     while True:
         try:
@@ -203,6 +203,12 @@ if __name__ == "__main__":
     chan = sys.argv[2]
     chan_helper = CHANS[chan]
 
+    proxy = None
+    if len(sys.argv) > 3:
+        proxy = sys.argv[3]
+        logger.info("Using proxy %s" % proxy)
+
+
     BYPASS_RPS = True
     if BYPASS_RPS:
         chan_helper.rps = 10
@@ -212,10 +218,10 @@ if __name__ == "__main__":
 
     publish_q = Queue()
     for _ in range(5):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
         publish_thread.start()
 
-    s = ChanScanner(chan_helper)
+    s = ChanScanner(chan_helper, proxy)
     while True:
         for p, b in s.all_posts():
             publish_q.put((p, b))
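With these changes the proxy arrives as an optional third command-line argument and is threaded through to the scanner and to every publish worker's Web client. A hypothetical invocation, assuming a local SOCKS proxy (requests[socks] in the requirements is what makes socks5h:// URLs work); the role of the first positional argument is not visible in this diff, so it is left as a placeholder:

python run.py <first-arg> 4chan socks5h://localhost:9050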
util.py (40 lines changed)
@@ -1,11 +1,14 @@
 import logging
 import sys
 import time
 import traceback
 from datetime import datetime
 from logging import FileHandler, StreamHandler
 
 import requests
+from hexlib.misc import rate_limit
+from urllib3 import disable_warnings
 
+disable_warnings()
+
-last_time_called = dict()
 
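Two remarks on the added imports: rate_limit replaces the locally defined decorator deleted in the next hunk (the @rate_limit(self._rps) call site is unchanged, so the hexlib version evidently keeps the same calls-per-second interface), and disable_warnings() silences urllib3's InsecureRequestWarning, which would otherwise be emitted on every request once the proxied session sets verify = False. A minimal sketch of the decorator convention, assuming hexlib's version behaves like the deleted one:

from hexlib.misc import rate_limit

@rate_limit(0.5)  # at most one call every two seconds
def poll():
    print("tick")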
@@ -21,35 +24,26 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 
-def rate_limit(per_second):
-    min_interval = 1.0 / float(per_second)
-
-    def decorate(func):
-        last_time_called[func] = 0
-
-        def wrapper(*args, **kwargs):
-            elapsed = time.perf_counter() - last_time_called[func]
-            wait_time = min_interval - elapsed
-            if wait_time > 0:
-                time.sleep(wait_time)
-
-            last_time_called[func] = time.perf_counter()
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorate
-
-
 class Web:
-    def __init__(self, monitoring, rps=1/2):
+    def __init__(self, monitoring, rps=1 / 2, proxy=None):
         self.session = requests.Session()
+        if proxy:
+            self.session.proxies = {"http": proxy, "https": proxy}
+            self.session.verify = False
         self._rps = rps
         self.monitoring = monitoring
 
         @rate_limit(self._rps)
         def _get(url, **kwargs):
-            return self.session.get(url, **kwargs)
+            retries = 3
+
+            while retries > 0:
+                retries -= 1
+                try:
+                    return self.session.get(url, **kwargs)
+                except Exception as e:
+                    logger.warning("Error with request %s: %s" % (url, str(e)))
+            raise Exception("Gave up request after maximum number of retries")
 
         self._get = _get
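The proxy wiring added to Web is standard requests behaviour; below is a standalone sketch of what the new constructor lines do, with a placeholder proxy URL. Note also that the old single session.get() call is now wrapped in a three-attempt retry loop that logs each failure and raises only after the last attempt:

import requests
import urllib3

urllib3.disable_warnings()  # verify=False below would otherwise warn on every request

session = requests.Session()
proxy = "socks5h://localhost:9050"  # placeholder; any http:// or socks proxy URL works
session.proxies = {"http": proxy, "https": proxy}  # route both schemes through it
session.verify = False  # tolerate the proxy rewriting TLS certificates

print(session.get("https://example.com").status_code)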