Update hexlib version, refactor

This commit is contained in:
simon987 2021-03-07 14:06:58 -05:00
parent 3dda97e180
commit 3fa2d90486
5 changed files with 17 additions and 78 deletions

View File

@ -5,7 +5,7 @@ services:
image: simon987/gabtv_feed image: simon987/gabtv_feed
restart: always restart: always
environment: environment:
- "GTV_REDIS_HOST=" - "REDIS_HOST="
- "GTV_RPS=0.10" - "RPS=0.10"
- "GTV_MAX_PAGES=9999999" - "GTV_MAX_PAGES=9999999"
- "GTV_RECRAWL_HOURS=8" - "GTV_RECRAWL_HOURS=8"

View File

@ -3,8 +3,10 @@ import time
import os import os
from time import sleep from time import sleep
from hexlib.env import get_web
from hexlib.log import logger
from state import GabTvState from state import GabTvState
from util import Web, logger
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import json import json
@ -73,9 +75,9 @@ def parse_channel_episode_list(channel_id, r):
class GabTvScanner: class GabTvScanner:
def __init__(self, state: GabTvState, rps): def __init__(self, state: GabTvState):
self._state = state self._state = state
self._web = Web(rps) self._web = get_web()
def episodes_of_channel(self, channel_id): def episodes_of_channel(self, channel_id):
if not self._state.has_visited_channel(channel_id): if not self._state.has_visited_channel(channel_id):

26
run.py
View File

@ -1,22 +1,14 @@
import json import json
import os
import traceback import traceback
from queue import Queue from queue import Queue
from threading import Thread from threading import Thread
import redis from hexlib.env import get_redis
from hexlib.log import logger
from gabtv import GabTvScanner, item_type from gabtv import GabTvScanner, item_type
from post_process import post_process from post_process import post_process
from state import GabTvState from state import GabTvState
from util import logger
REDIS_HOST = os.environ.get("GTV_REDIS_HOST", "localhost")
REDIS_PORT = os.environ.get("GTV_REDIS_PORT", 6379)
PF_PUBLISH = os.environ.get("GTV_PUBLISH", False)
GTV_RPS = os.environ.get("GTV_RPS", 1)
ARC_LISTS = os.environ.get("GTV_ARC_LISTS", "arc").split(",")
def publish_worker(queue: Queue): def publish_worker(queue: Queue):
@ -50,23 +42,20 @@ def publish(item):
routing_key = "%s.x" % (itm_type,) routing_key = "%s.x" % (itm_type,)
message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True) message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
if PF_PUBLISH: rdb.lpush("arc.gtv." + routing_key, message)
rdb.publish("gtv." + routing_key, message)
for arc in ARC_LISTS:
rdb.lpush(arc + ".gtv." + routing_key, message)
if __name__ == "__main__": if __name__ == "__main__":
state = GabTvState("gtv", REDIS_HOST, REDIS_PORT) state = GabTvState("gtv")
rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT) rdb = get_redis()
publish_q = Queue() publish_q = Queue()
publish_thread = Thread(target=publish_worker, args=(publish_q,)) publish_thread = Thread(target=publish_worker, args=(publish_q,))
publish_thread.setDaemon(True) publish_thread.setDaemon(True)
publish_thread.start() publish_thread.start()
s = GabTvScanner(state, GTV_RPS) s = GabTvScanner(state)
while True: while True:
try: try:
@ -74,6 +63,5 @@ if __name__ == "__main__":
publish_q.put(item) publish_q.put(item)
except KeyboardInterrupt as e: except KeyboardInterrupt as e:
print("cleanup..") print("cleanup..")
for _ in range(3): publish_q.put(None)
publish_q.put(None)
break break

View File

@ -3,15 +3,13 @@ import os
from hexlib.db import VolatileState, VolatileBooleanState from hexlib.db import VolatileState, VolatileBooleanState
from util import logger
RECRAWL_HOURS = int(os.environ.get("GTV_RECRAWL_HOURS", 8)) RECRAWL_HOURS = int(os.environ.get("GTV_RECRAWL_HOURS", 8))
class GabTvState: class GabTvState:
def __init__(self, prefix, host, port): def __init__(self, prefix):
self._episodes = VolatileState(prefix, host=host, port=port) self._episodes = VolatileState(prefix)
self._visited = VolatileBooleanState(prefix, host=host, port=port) self._visited = VolatileBooleanState(prefix)
def has_visited(self, item_id): def has_visited(self, item_id):
return self._visited["byid"][item_id] return self._visited["byid"][item_id]

49
util.py
View File

@ -1,49 +0,0 @@
import logging
import sys
import traceback
from logging import StreamHandler
import requests
from hexlib.misc import rate_limit
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
for h in logger.handlers:
logger.removeHandler(h)
logger.addHandler(StreamHandler(sys.stdout))
class Web:
def __init__(self, rps):
self.session = requests.Session()
self._rps = rps
@rate_limit(self._rps)
def _get(url, **kwargs):
retries = 3
while retries > 0:
retries -= 1
try:
return self.session.get(url, **kwargs)
except KeyboardInterrupt as e:
raise e
except Exception as e:
logger.warning("Error with request %s: %s" % (url, str(e)))
raise Exception("Gave up request after maximum number of retries")
self._get = _get
def get(self, url, **kwargs):
try:
r = self._get(url, **kwargs)
logger.debug("GET %s <%d>" % (url, r.status_code))
return r
except KeyboardInterrupt as e:
raise e
except Exception as e:
logger.error(str(e) + traceback.format_exc())
return None