Initial commit

simon987 2020-12-25 20:11:46 -05:00
commit 2f8cbca0b2
9 changed files with 359 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
.idea/
*.pyc

11
Dockerfile Normal file

@@ -0,0 +1,11 @@
FROM python:3.8
ADD requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN chmod 777 -R /app
WORKDIR /app
ENTRYPOINT ["python", "run.py"]

10
docker-compose.yml Normal file

@@ -0,0 +1,10 @@
version: "3"

services:
  scraper:
    image: simon987/gabtv_feed
    restart: always
    environment:
      - "GTV_REDIS_HOST="
      - "GTV_RPS=0.10"
      - "GTV_MAX_PAGES=9999999"

164
gabtv.py Normal file

@@ -0,0 +1,164 @@
from json import JSONDecodeError
import time
import os
from time import sleep

from state import GabTvState
from util import Web, logger
from bs4 import BeautifulSoup
import json

PER_PAGE = 5
MAX_PAGES = int(os.environ.get("GTV_MAX_PAGES", 99999999))


def item_type(item):
    # The API does not label objects, so guess the type from distinctive fields
    if "author" in item:
        return "comment"
    if "category" in item:
        return "episode"
    if "moderators" in item:
        return "channel"


def episode_url(page, cpp=PER_PAGE):
    return "https://tv.gab.com/api/v1/episode?cpp=%d&p=%d" % (cpp, page,)


def increment_episode_url(url):
    # Bump the trailing p=<n> page parameter by one
    tokens = url.rsplit("=", 1)
    return "=".join((
        tokens[0],
        str(int(tokens[1]) + 1)
    ))


def comments_url(episode):
    return f"https://tv.gab.com/channel/{episode['channel']['slug']}/view/{episode['slug']}"


def channel_url(channel_id, page=1):
    return f"https://tv.gab.com/api/v1/channel/{channel_id}/episode?p={page}"


def parse_episode_list(r):
    try:
        j = json.loads(r.content.decode('utf-8', 'ignore'))
    except JSONDecodeError:
        logger.warning("JSONDecodeError for %s:" % (r.url,))
        logger.warning(r.text)
        return [], None

    episodes = j["episodes"]
    page = j["pagination"]["p"]

    if len(episodes) == PER_PAGE and page + 1 < MAX_PAGES:
        return episodes, episode_url(page=page + 1)
    return episodes, None


def parse_channel_episode_list(channel_id, r):
    try:
        j = json.loads(r.content.decode('utf-8', 'ignore'))
    except JSONDecodeError:
        logger.warning("JSONDecodeError for %s:" % (r.url,))
        logger.warning(r.text)
        return [], None

    episodes = j["episodes"]

    if len(episodes) == PER_PAGE:
        page = j["pagination"]["p"]
        return episodes, channel_url(channel_id, page=page + 1)
    return episodes, None


class GabTvScanner:
    def __init__(self, state: GabTvState, rps):
        self._state = state
        self._web = Web(rps)

    def episodes_of_channel(self, channel_id):
        if not self._state.has_visited_channel(channel_id):
            r = self._web.get(channel_url(channel_id))

            while True:
                episodes, next_url = parse_channel_episode_list(channel_id, r)
                for episode in episodes:
                    yield episode
                self._state.mark_visited_channel(channel_id)

                if not next_url:
                    break
                r = self._web.get(next_url)
                if not r or r.status_code != 200:
                    break

    def episodes(self):
        r = self._web.get(episode_url(page=1))

        # TODO: This is sometimes broken for no reason on page=1 (?!)
        if not r or r.status_code == 500:
            sleep(30)
            return []

        skips = 15

        while True:
            episodes, next_url = parse_episode_list(r)
            for episode in episodes:
                yield episode

                # Also crawl channel list
                # Looks like only a
                channel_id = episode["channel"]["_id"]
                for channel_ep in self.episodes_of_channel(channel_id):
                    yield channel_ep

            if not next_url:
                break
            r = self._web.get(next_url)

            # Some pages are broken, attempt to skip it once
            while r is not None and r.status_code == 500 and skips > 0:
                logger.info("Skipped page!")
                next_url = increment_episode_url(next_url)
                r = self._web.get(next_url)
                skips -= 1

            if not r or r.status_code != 200:
                break
            skips = 15

    def fetch_random_episode_ids(self):
        r = self._web.get("https://tv.gab.com/search")
        if not r or r.status_code != 200:
            return []

    def fetch_comments(self, episode):
        # Comments are not returned by the API; scrape them from the episode's HTML page
        r = self._web.get(comments_url(episode))
        if not r or r.status_code != 200:
            return []

        soup = BeautifulSoup(r.content, "html.parser")

        for com_el in soup.find_all("div", class_="tv-comment"):
            yield {
                "_id": com_el.find("div", class_="comment-content").get("data-comment-id"),
                "author_display_name": com_el.find("div", class_="author-name").find("a").text,
                "author": com_el.find("div", class_="author-name").find("a").get("href").split("/")[2],
                "channel": episode["channel"]["_id"],
                "episode": episode["_id"],
                "_created_rel": int(time.time()),
                "created": com_el.find("div", class_="created-date").text,
                "content": com_el.find("div", class_="comment-content").text.strip(),
                "upvotes": int(com_el.find("span", class_="upvote-label").text),
                "downvotes": int(com_el.find("span", class_="downvote-label").text),
                "replies": int(com_el.find_all("span", class_="icon-label")[-1].text),
                "_raw": str(com_el)
            }

    def all_items(self):
        for episode in self.episodes():
            yield episode
            yield episode["channel"]

            if not self._state.has_visited_episode(episode):
                for comment in self.fetch_comments(episode):
                    yield comment
                self._state.mark_visited_episode(episode)
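
A quick sanity check of the pagination helpers above (not part of this commit; it only assumes gabtv.py is importable from the repository root):

from gabtv import episode_url, increment_episode_url

print(episode_url(page=1))
# https://tv.gab.com/api/v1/episode?cpp=5&p=1
print(increment_episode_url(episode_url(page=3)))
# https://tv.gab.com/api/v1/episode?cpp=5&p=4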

4
post_process.py Normal file

@@ -0,0 +1,4 @@
def post_process(item):
    item["_v"] = 1.0

    return item

5
requirements.txt Normal file

@@ -0,0 +1,5 @@
requests
urllib3
git+git://github.com/simon987/hexlib.git
redis
bs4

80
run.py Normal file

@@ -0,0 +1,80 @@
import json
import os
import traceback
from queue import Queue
from threading import Thread

import redis

from gabtv import GabTvScanner, item_type
from post_process import post_process
from state import GabTvState
from util import logger

REDIS_HOST = os.environ.get("GTV_REDIS_HOST", "localhost")
REDIS_PORT = int(os.environ.get("GTV_REDIS_PORT", 6379))
PF_PUBLISH = os.environ.get("GTV_PUBLISH", False)
GTV_RPS = float(os.environ.get("GTV_RPS", 1))
ARC_LISTS = os.environ.get("GTV_ARC_LISTS", "arc").split(",")


def publish_worker(queue: Queue):
    while True:
        try:
            item = queue.get()
            if item is None:
                break
            publish(item)
        except Exception as e:
            logger.error(str(e) + ": " + traceback.format_exc())
        finally:
            queue.task_done()


def once(func):
    # Only call func for items whose _id has not been seen before
    def wrapper(item):
        if not state.has_visited(item["_id"]):
            func(item)
            state.mark_visited(item["_id"])

    return wrapper


@once
def publish(item):
    post_process(item)

    itm_type = item_type(item)
    routing_key = "%s.x" % (itm_type,)
    message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)

    if PF_PUBLISH:
        rdb.publish("gtv." + routing_key, message)
    for arc in ARC_LISTS:
        rdb.lpush(arc + ".gtv." + routing_key, message)


if __name__ == "__main__":
    state = GabTvState("gtv", REDIS_HOST, REDIS_PORT)
    rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    publish_q = Queue()
    for _ in range(3):
        publish_thread = Thread(target=publish_worker, args=(publish_q,))
        publish_thread.setDaemon(True)
        publish_thread.start()

    s = GabTvScanner(state, GTV_RPS)

    while True:
        try:
            for item in s.all_items():
                publish_q.put(item)
        except KeyboardInterrupt:
            print("cleanup..")
            for _ in range(3):
                publish_q.put(None)
            break
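
Items accepted by publish() are LPUSHed onto Redis lists named <arc prefix>.gtv.<item type>.x, so with the default "arc" prefix the lists are arc.gtv.episode.x, arc.gtv.channel.x and arc.gtv.comment.x. A minimal consumer sketch (not part of this commit) that drains those lists with redis-py:

import json

import redis

rdb = redis.Redis(host="localhost", port=6379)
KEYS = ["arc.gtv.episode.x", "arc.gtv.channel.x", "arc.gtv.comment.x"]

while True:
    # BRPOP blocks until an item shows up on one of the lists, or times out
    result = rdb.brpop(KEYS, timeout=30)
    if result is None:
        continue
    key, message = result
    item = json.loads(message)
    print(key.decode(), item.get("_id"))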

34
state.py Normal file

@@ -0,0 +1,34 @@
from time import time

from hexlib.db import VolatileState, VolatileBooleanState


class GabTvState:
    def __init__(self, prefix, host, port):
        self._episodes = VolatileState(prefix, host=host, port=port)
        self._visited = VolatileBooleanState(prefix, host=host, port=port)

    def has_visited(self, item_id):
        return self._visited["byid"][item_id]

    def mark_visited(self, item_id):
        self._visited["byid"][item_id] = True

    def has_visited_episode(self, episode):
        # TODO: This doesn't actually work because the 'stats' object never actually updates (?!?)
        # if episode["stats"]["commentCount"] == 0:
        #     # No comments, don't need to visit
        #     return True
        # com_count = self._episodes["episodes"][episode["_id"]]
        # return not com_count or episode["stats"]["commentCount"] == com_count

        # Treat an episode as visited if its comments were fetched within the last three days
        last_visited = self._episodes["ep_ts"][episode["_id"]]
        return last_visited and int(time()) - int(last_visited) <= 3600 * 24 * 3

    def mark_visited_episode(self, episode):
        self._episodes["ep_ts"][episode["_id"]] = int(time())

    def has_visited_channel(self, channel_id):
        return self._visited["channel"][channel_id]

    def mark_visited_channel(self, item_id):
        self._visited["channel"][item_id] = True

49
util.py Normal file

@@ -0,0 +1,49 @@
import logging
import sys
import traceback
from logging import StreamHandler

import requests
from hexlib.misc import rate_limit

logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')

for h in list(logger.handlers):
    logger.removeHandler(h)

# Log to stdout with the format defined above
handler = StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)


class Web:
    def __init__(self, rps):
        self.session = requests.Session()
        self._rps = rps

        @rate_limit(self._rps)
        def _get(url, **kwargs):
            retries = 3

            while retries > 0:
                retries -= 1
                try:
                    return self.session.get(url, **kwargs)
                except KeyboardInterrupt as e:
                    raise e
                except Exception as e:
                    logger.warning("Error with request %s: %s" % (url, str(e)))
            raise Exception("Gave up request after maximum number of retries")

        self._get = _get

    def get(self, url, **kwargs):
        try:
            r = self._get(url, **kwargs)
            logger.debug("GET %s <%d>" % (url, r.status_code))
            return r
        except KeyboardInterrupt as e:
            raise e
        except Exception as e:
            logger.error(str(e) + traceback.format_exc())
            return None
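
A minimal usage sketch for the Web helper above (not part of this commit; the URL is just the first episode-list page that gabtv.py requests):

from util import Web

web = Web(rps=0.5)
r = web.get("https://tv.gab.com/api/v1/episode?cpp=5&p=1")
if r is not None and r.status_code == 200:
    print(len(r.content))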