Compare commits

..

No commits in common. "446417f212976fc5d3213276c3a3e9b91fab44df" and "3dda97e1808d0244d1d59096c07a8e91773fdba2" have entirely different histories.

6 changed files with 79 additions and 19 deletions

View File

@ -5,7 +5,7 @@ services:
image: simon987/gabtv_feed image: simon987/gabtv_feed
restart: always restart: always
environment: environment:
- "REDIS_HOST=" - "GTV_REDIS_HOST="
- "RPS=0.10" - "GTV_RPS=0.10"
- "GTV_MAX_PAGES=9999999" - "GTV_MAX_PAGES=9999999"
- "GTV_RECRAWL_HOURS=8" - "GTV_RECRAWL_HOURS=8"

View File

@ -3,10 +3,8 @@ import time
import os import os
from time import sleep from time import sleep
from hexlib.env import get_web
from hexlib.log import logger
from state import GabTvState from state import GabTvState
from util import Web, logger
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import json import json
@ -75,9 +73,9 @@ def parse_channel_episode_list(channel_id, r):
class GabTvScanner: class GabTvScanner:
def __init__(self, state: GabTvState): def __init__(self, state: GabTvState, rps):
self._state = state self._state = state
self._web = get_web() self._web = Web(rps)
def episodes_of_channel(self, channel_id): def episodes_of_channel(self, channel_id):
if not self._state.has_visited_channel(channel_id): if not self._state.has_visited_channel(channel_id):

View File

@ -1,5 +1,5 @@
requests requests
urllib3 urllib3
redis
git+git://github.com/simon987/hexlib.git git+git://github.com/simon987/hexlib.git
redis
bs4 bs4

24
run.py
View File

@ -1,14 +1,22 @@
import json import json
import os
import traceback import traceback
from queue import Queue from queue import Queue
from threading import Thread from threading import Thread
from hexlib.env import get_redis import redis
from hexlib.log import logger
from gabtv import GabTvScanner, item_type from gabtv import GabTvScanner, item_type
from post_process import post_process from post_process import post_process
from state import GabTvState from state import GabTvState
from util import logger
REDIS_HOST = os.environ.get("GTV_REDIS_HOST", "localhost")
REDIS_PORT = os.environ.get("GTV_REDIS_PORT", 6379)
PF_PUBLISH = os.environ.get("GTV_PUBLISH", False)
GTV_RPS = os.environ.get("GTV_RPS", 1)
ARC_LISTS = os.environ.get("GTV_ARC_LISTS", "arc").split(",")
def publish_worker(queue: Queue): def publish_worker(queue: Queue):
@ -42,20 +50,23 @@ def publish(item):
routing_key = "%s.x" % (itm_type,) routing_key = "%s.x" % (itm_type,)
message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True) message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
rdb.lpush("arc.gtv." + routing_key, message) if PF_PUBLISH:
rdb.publish("gtv." + routing_key, message)
for arc in ARC_LISTS:
rdb.lpush(arc + ".gtv." + routing_key, message)
if __name__ == "__main__": if __name__ == "__main__":
state = GabTvState("gtv") state = GabTvState("gtv", REDIS_HOST, REDIS_PORT)
rdb = get_redis() rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
publish_q = Queue() publish_q = Queue()
publish_thread = Thread(target=publish_worker, args=(publish_q,)) publish_thread = Thread(target=publish_worker, args=(publish_q,))
publish_thread.setDaemon(True) publish_thread.setDaemon(True)
publish_thread.start() publish_thread.start()
s = GabTvScanner(state) s = GabTvScanner(state, GTV_RPS)
while True: while True:
try: try:
@ -63,5 +74,6 @@ if __name__ == "__main__":
publish_q.put(item) publish_q.put(item)
except KeyboardInterrupt as e: except KeyboardInterrupt as e:
print("cleanup..") print("cleanup..")
for _ in range(3):
publish_q.put(None) publish_q.put(None)
break break

View File

@ -2,15 +2,16 @@ from time import time
import os import os
from hexlib.db import VolatileState, VolatileBooleanState from hexlib.db import VolatileState, VolatileBooleanState
from hexlib.log import logger
from util import logger
RECRAWL_HOURS = int(os.environ.get("GTV_RECRAWL_HOURS", 8)) RECRAWL_HOURS = int(os.environ.get("GTV_RECRAWL_HOURS", 8))
class GabTvState: class GabTvState:
def __init__(self, prefix): def __init__(self, prefix, host, port):
self._episodes = VolatileState(prefix) self._episodes = VolatileState(prefix, host=host, port=port)
self._visited = VolatileBooleanState(prefix) self._visited = VolatileBooleanState(prefix, host=host, port=port)
def has_visited(self, item_id): def has_visited(self, item_id):
return self._visited["byid"][item_id] return self._visited["byid"][item_id]

49
util.py Normal file
View File

@ -0,0 +1,49 @@
import logging
import sys
import traceback
from logging import StreamHandler
import requests
from hexlib.misc import rate_limit
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
for h in logger.handlers:
logger.removeHandler(h)
logger.addHandler(StreamHandler(sys.stdout))
class Web:
def __init__(self, rps):
self.session = requests.Session()
self._rps = rps
@rate_limit(self._rps)
def _get(url, **kwargs):
retries = 3
while retries > 0:
retries -= 1
try:
return self.session.get(url, **kwargs)
except KeyboardInterrupt as e:
raise e
except Exception as e:
logger.warning("Error with request %s: %s" % (url, str(e)))
raise Exception("Gave up request after maximum number of retries")
self._get = _get
def get(self, url, **kwargs):
try:
r = self._get(url, **kwargs)
logger.debug("GET %s <%d>" % (url, r.status_code))
return r
except KeyboardInterrupt as e:
raise e
except Exception as e:
logger.error(str(e) + traceback.format_exc())
return None