diff --git a/docker-compose.yml b/docker-compose.yml index 60a0aa3..816261c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,7 @@ services: image: simon987/poal_feed restart: always environment: - - "PF_REDIS_HOST=" - - "PF_RPS=1" + - "REDIS_HOST=" + - "RPS=1" + - "USER_AGENT=firefox" - "PF_MAX_PAGE=300" diff --git a/poal.py b/poal.py index 463d5a4..5e98164 100644 --- a/poal.py +++ b/poal.py @@ -3,17 +3,18 @@ import os from json import JSONDecodeError from urllib.parse import urljoin +from hexlib.env import get_web +from hexlib.log import logger + from post_process import get_links_from_body from state import PoalState -from util import Web, logger PF_MAX_PAGE = int(os.environ.get("PF_MAX_PAGE", 9999999)) class PoalHelper: - def __init__(self, url, rps, boards): - self.rps = rps + def __init__(self, url, boards): self._boards = boards self._url = url @@ -107,7 +108,7 @@ class PoalScanner: def __init__(self, state: PoalState, helper: PoalHelper): self._state = state self._helper = helper - self._web = Web(rps=helper.rps) + self._web = get_web() def _posts(self, board): r = self._web.get(self._helper.posts_url(board)) diff --git a/requirements.txt b/requirements.txt index 86f3a42..ddbb911 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ requests urllib3 -git+git://github.com/simon987/hexlib.git redis +git+git://github.com/simon987/hexlib.git diff --git a/run.py b/run.py index 630d7d4..ec8c9f8 100644 --- a/run.py +++ b/run.py @@ -1,36 +1,18 @@ import json -import os -import traceback from queue import Queue from threading import Thread -import redis +from hexlib.concurrency import queue_iter +from hexlib.env import get_redis from poal import PoalScanner, PoalHelper from post_process import post_process from state import PoalState -from util import logger - -REDIS_HOST = os.environ.get("PF_REDIS_HOST", "localhost") -REDIS_PORT = os.environ.get("PF_REDIS_PORT", 6379) -PF_PUBLISH = os.environ.get("PF_PUBLISH", False) -PF_RPS = os.environ.get("PF_RPS", 1) - -ARC_LISTS = os.environ.get("PF_ARC_LISTS", "arc").split(",") def publish_worker(queue: Queue, helper): - while True: - try: - item, board = queue.get() - if item is None: - break - publish(item, board, helper) - - except Exception as e: - logger.error(str(e) + ": " + traceback.format_exc()) - finally: - queue.task_done() + for item, board in queue_iter(queue): + publish(item, board, helper) def once(func): @@ -50,10 +32,7 @@ def publish(item, board, helper): routing_key = "%s.%s" % (item_type, board) message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True) - if PF_PUBLISH: - rdb.publish("poal." + routing_key, message) - for arc in ARC_LISTS: - rdb.lpush(arc + ".poal." + routing_key, message) + rdb.lpush("arc.poal." + routing_key, message) HELPER = PoalHelper( @@ -61,20 +40,18 @@ HELPER = PoalHelper( "all", # TODO: Are there hidden boards that do not show up in /all ? ), - rps=PF_RPS, url="https://poal.co" ) if __name__ == "__main__": - state = PoalState("poal", REDIS_HOST, REDIS_PORT) - rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT) + state = PoalState("poal") + rdb = get_redis() publish_q = Queue() - for _ in range(3): - publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER)) - publish_thread.setDaemon(True) - publish_thread.start() + publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER)) + publish_thread.setDaemon(True) + publish_thread.start() s = PoalScanner(state, HELPER) while True: @@ -83,6 +60,5 @@ if __name__ == "__main__": publish_q.put((item, board)) except KeyboardInterrupt as e: print("cleanup..") - for _ in range(3): - publish_q.put((None, None)) + publish_q.put((None, None)) break diff --git a/state.py b/state.py index 8f08cc8..9ad8b12 100644 --- a/state.py +++ b/state.py @@ -2,10 +2,10 @@ from hexlib.db import VolatileState, VolatileBooleanState class PoalState: - def __init__(self, prefix, host, port): - self._posts = VolatileState(prefix, host=host, port=port) - self._comments = VolatileBooleanState(prefix, host=host, port=port) - self._users = VolatileBooleanState(prefix, host=host, port=port) + def __init__(self, prefix): + self._posts = VolatileState(prefix) + self._comments = VolatileBooleanState(prefix) + self._users = VolatileBooleanState(prefix) def has_visited(self, item_id): return self._comments["comments"][item_id] diff --git a/util.py b/util.py deleted file mode 100644 index e6d8d70..0000000 --- a/util.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging -import sys -import traceback -from logging import StreamHandler - -import requests -from hexlib.misc import rate_limit - -logger = logging.getLogger("default") -logger.setLevel(logging.DEBUG) - -formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s') -for h in logger.handlers: - logger.removeHandler(h) -logger.addHandler(StreamHandler(sys.stdout)) - -UA = "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:83.0) Gecko/20100101 Firefox/83.0" - - -class Web: - def __init__(self, rps): - self.session = requests.Session() - self._rps = rps - - @rate_limit(self._rps) - def _get(url, **kwargs): - if "headers" in kwargs: - kwargs["headers"]["User-Agent"] = UA - else: - kwargs["headers"] = {"User-Agent": UA} - retries = 8 - - while retries > 0: - retries -= 1 - try: - r = self.session.get(url, **kwargs) - if r.status_code == 500: - raise Exception("Server error") - return r - except KeyboardInterrupt as e: - raise e - except Exception as e: - logger.warning("Error with request %s: %s" % (url, str(e))) - raise Exception("Gave up request after maximum number of retries") - - self._get = _get - - def get(self, url, **kwargs): - try: - r = self._get(url, **kwargs) - - logger.debug("GET %s <%d>" % (url, r.status_code)) - return r - except KeyboardInterrupt as e: - raise e - except Exception as e: - logger.error(str(e) + traceback.format_exc()) - return None