Mirror of https://github.com/simon987/chan_feed.git, synced 2025-04-20 10:36:43 +00:00
Optional proxy, small refactor
This commit is contained in:
parent b6425713a8
commit 65712e2ca1
@@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
 horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
-chan.org.li.
+chan.org.li, hispachan, 8kun, nowere, iichan and more.
 
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.
BIN monitoring.png
Binary file not shown. Before: 87 KiB | After: 366 KiB
@@ -1,17 +1,15 @@
-import base64
 import hashlib
-import re
 import zlib
 from io import BytesIO
 from urllib.parse import urljoin
 
 import imagehash
 from PIL import Image
+from hexlib.imhash import b64hash
 
 from util import logger
 
-LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
-HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
+from hexlib.regex import HTML_HREF_RE, LINK_RE
 
 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -33,12 +31,6 @@ def _is_image(url):
     return url.lower().endswith(IMAGE_FILETYPES)
 
 
-def b64hash(imhash, bcount):
-    return base64.b64encode(
-        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
-    ).decode("ascii")
-
-
 def image_meta(url, url_idx, web):
     r = web.get(url)
     if not r:
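The removed b64hash helper now comes from hexlib (imported above as from hexlib.imhash import b64hash). For reference, a self-contained sketch of what the helper computes; the input file name, the phash call, and bcount=8 are illustrative choices, not values taken from this commit:

import base64

import imagehash
from PIL import Image


def b64hash_local(imhash, bcount):
    # Pack the hash's flattened bit matrix into an integer, serialise it
    # big-endian into bcount bytes, then base64-encode to ASCII.
    return base64.b64encode(
        sum(1 << i for i, b in enumerate(imhash.hash.flatten()) if b).to_bytes(bcount, "big")
    ).decode("ascii")


im = Image.open("sample.jpg")  # illustrative input image
print(b64hash_local(imagehash.phash(im), bcount=8))  # a 64-bit pHash fits in 8 bytes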
@@ -6,3 +6,5 @@ stem
 influxdb
 pika
 bs4
+urllib3
+git+git://github.com/simon987/hexlib.git
run.py (18 lines changed)
@@ -19,8 +19,8 @@ BYPASS_RPS = False
 
 
 class ChanScanner:
-    def __init__(self, helper):
-        self.web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    def __init__(self, helper, proxy):
+        self.web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=proxy)
         self.helper = helper
         self.state = ChanState()
 
@@ -139,9 +139,9 @@ class ChanState:
         conn.commit()
 
 
-def publish_worker(queue: Queue, helper):
+def publish_worker(queue: Queue, helper, p):
     channel = connect()
-    web = Web(monitoring if MONITORING else None, rps=helper.rps)
+    web = Web(monitoring if MONITORING else None, rps=helper.rps, proxy=p)
 
     while True:
         try:
@@ -203,6 +203,12 @@ if __name__ == "__main__":
     chan = sys.argv[2]
     chan_helper = CHANS[chan]
 
+    proxy = None
+    if len(sys.argv) > 3:
+        proxy = sys.argv[3]
+        logger.info("Using proxy %s" % proxy)
+
+        BYPASS_RPS = True
     if BYPASS_RPS:
         chan_helper.rps = 10
 
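With this hunk, the proxy is an optional third positional argument, and supplying one also forces BYPASS_RPS. Assuming the first argument is the RabbitMQ host (it is read elsewhere in run.py, not in this hunk), a hypothetical invocation with a local HTTP proxy would be:

python run.py localhost 4chan http://127.0.0.1:8080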
@@ -212,10 +218,10 @@
 
     publish_q = Queue()
     for _ in range(5):
-        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
+        publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
         publish_thread.start()
 
-    s = ChanScanner(chan_helper)
+    s = ChanScanner(chan_helper, proxy)
     while True:
         for p, b in s.all_posts():
             publish_q.put((p, b))
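The publisher fan-out above is the standard queue-plus-worker-threads pattern: one scanner thread feeds a shared Queue drained by five publishers. A minimal, self-contained sketch of the same idea; the worker body is a stand-in for publish_worker, which needs a live RabbitMQ connection:

from queue import Queue
from threading import Thread


def worker(q: Queue):
    # Stand-in for publish_worker: drain the shared queue forever.
    while True:
        item = q.get()
        print("publishing", item)
        q.task_done()


q = Queue()
for _ in range(5):
    Thread(target=worker, args=(q,), daemon=True).start()

for i in range(20):
    q.put(i)
q.join()  # block until every queued item has been processed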
util.py (38 lines changed)
@@ -1,11 +1,14 @@
 import logging
 import sys
-import time
 import traceback
 from datetime import datetime
 from logging import FileHandler, StreamHandler
 
 import requests
+from hexlib.misc import rate_limit
+from urllib3 import disable_warnings
 
+disable_warnings()
+
 last_time_called = dict()
 
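The disable_warnings() call pairs with the proxy change below: when a proxy is configured, Web sets session.verify = False, and every request would otherwise emit urllib3's InsecureRequestWarning.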
@@ -21,35 +24,26 @@ logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))
 
 
-def rate_limit(per_second):
-    min_interval = 1.0 / float(per_second)
-
-    def decorate(func):
-        last_time_called[func] = 0
-
-        def wrapper(*args, **kwargs):
-            elapsed = time.perf_counter() - last_time_called[func]
-            wait_time = min_interval - elapsed
-            if wait_time > 0:
-                time.sleep(wait_time)
-
-            last_time_called[func] = time.perf_counter()
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorate
-
-
 class Web:
-    def __init__(self, monitoring, rps=1/2):
+    def __init__(self, monitoring, rps=1 / 2, proxy=None):
         self.session = requests.Session()
+        if proxy:
+            self.session.proxies = {"http": proxy, "https": proxy}
+            self.session.verify = False
         self._rps = rps
         self.monitoring = monitoring
 
         @rate_limit(self._rps)
         def _get(url, **kwargs):
-            return self.session.get(url, **kwargs)
+            retries = 3
+
+            while retries > 0:
+                retries -= 1
+                try:
+                    return self.session.get(url, **kwargs)
+                except Exception as e:
+                    logger.warning("Error with request %s: %s" % (url, str(e)))
+            raise Exception("Gave up request after maximum number of retries")
 
         self._get = _get
 
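Taken together, a minimal sketch of the proxied, retrying session that Web now builds, assuming a placeholder proxy URL; the retry loop mirrors the new _get:

import requests
from urllib3 import disable_warnings

disable_warnings()  # silence InsecureRequestWarning from verify=False

session = requests.Session()
proxy = "http://127.0.0.1:8080"  # placeholder proxy URL, not from this repo
session.proxies = {"http": proxy, "https": proxy}
session.verify = False  # skip TLS verification when routing through the proxy


def get_with_retries(url, retries=3, **kwargs):
    # Mirror of Web._get: retry transport errors, give up after 3 attempts.
    while retries > 0:
        retries -= 1
        try:
            return session.get(url, **kwargs)
        except Exception as e:
            print("Error with request %s: %s" % (url, str(e)))
    raise Exception("Gave up request after maximum number of retries")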