diff --git a/docker-compose.yml b/docker-compose.yml index 816261c..93a0fa1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,4 +8,4 @@ services: - "REDIS_HOST=" - "RPS=1" - "USER_AGENT=firefox" - - "PF_MAX_PAGE=300" + - "PROXY=" diff --git a/poal.py b/poal.py index 5e98164..e18cab8 100644 --- a/poal.py +++ b/poal.py @@ -1,185 +1,92 @@ -import json -import os -from json import JSONDecodeError -from urllib.parse import urljoin - +from bs4 import BeautifulSoup from hexlib.env import get_web -from hexlib.log import logger -from post_process import get_links_from_body from state import PoalState -PF_MAX_PAGE = int(os.environ.get("PF_MAX_PAGE", 9999999)) - - -class PoalHelper: - - def __init__(self, url, boards): - self._boards = boards - self._url = url - - def item_unique_id(self, item): - item_type = self.item_type(item) - if item_type == "post": - return item["pid"] - if item_type == "comment": - return item["cid"] - - return item["uid"] - - def item_urls(self, item): - item_type = self.item_type(item) - if item_type == "post": - urls = [ - item["link"], - *(get_links_from_body(item["content"]) if item["content"] else []) - ] - if item["thumbnail"]: - urls.append(urljoin("https://poal.co/static/thumbs/", item["thumbnail"])) - return urls - return get_links_from_body(item["content"]) if item["content"] else [] - - def item_type(self, item): - if "cid" in item: - return "comment" - if "pid" in item: - return "post" - return "user" - - def item_user(self, item): - return (item["user"], item["uid"]) if "uid" in item and item["user"] != "[Deleted]" else (None, None) - - def boards(self): - return [b.replace("\\_", "_") for b in self._boards if not b.startswith("_")] - - def posts_url(self, board, page=1): - return "%s/api/getPostList/%s/new/%d" % (self._url, board, page) - - def comments_url(self, post_id, page=1): - return "%s/api/getPostComments/%s/%d" % (self._url, str(post_id), page) - - def user_url(self, username): - return "%s/api/getUser/%s" % (self._url, username) - - def parse_posts_list(self, r, board): - try: - j = json.loads(r.content.decode('utf-8', 'ignore')) - if "posts" not in j: - logger.warning("No posts in response for %s: %s" % (r.url, r.text,)) - return [], None - except JSONDecodeError: - logger.warning("JSONDecodeError for %s:" % (r.url,)) - logger.warning(r.text) - return [], None - - posts = j["posts"] - if len(posts) == 25: - if len(r.history): - page = 1 - else: - page = int(r.url.split("/")[-1]) - if page + 1 > PF_MAX_PAGE: - return posts, None - return posts, self.posts_url(board, page=page + 1) - return posts, None - - def parse_comments(self, r): - try: - j = json.loads(r.content.decode('utf-8', 'ignore')) - except JSONDecodeError: - logger.warning("JSONDecodeError for %s:" % (r.url,)) - logger.warning(r.text) - return [] - - comments = j["comments"] - if len(comments) == 50: - if len(r.history): - pid = int(r.url.split("/")[-1]) - page = 1 - else: - pid = int(r.url.split("/")[-2]) - page = int(r.url.split("/")[-1]) - return comments, self.comments_url(pid, page=page + 1) - return comments, None - class PoalScanner: - def __init__(self, state: PoalState, helper: PoalHelper): + def __init__(self, state: PoalState): self._state = state - self._helper = helper self._web = get_web() - def _posts(self, board): - r = self._web.get(self._helper.posts_url(board)) - if not r or r.status_code != 200: - return [] + def _parse_post(self, r, soup, pid): + sub = r.url.split("/")[-2] - while True: - threads, next_url = self._helper.parse_posts_list(r, board) - for thread in threads: - yield thread - if not next_url: - break - r = self._web.get(next_url) - if not r or r.status_code != 200: - break + post = { + "_id": pid, + "pid": pid, + "sub": sub, + } - def _fetch_comments(self, post): - r = self._web.get(self._helper.comments_url(post["pid"])) - if not r or r.status_code != 200: - return [] - - while True: - comments, next_url = self._helper.parse_comments(r) - for comment in comments: - yield comment - if not next_url: - break - r = self._web.get(next_url) - if not r or r.status_code != 200: - break - - def _fetch_user(self, username): - r = self._web.get(self._helper.user_url(username)) - if not r or r.status_code != 200: - return None - return self.parse_user(r) - - def parse_user(self, r): try: - j = json.loads(r.content.decode('utf-8', 'ignore')) - if "error" in j: - return None - return j - except JSONDecodeError: - logger.warning("JSONDecodeError for %s:" % (r.url,)) - logger.warning(r.text) + post["user"] = soup.find("div", class_="postinfo").find("a", href=lambda x: x and x.startswith("/u/")).text + except: + post["user"] = "[deleted]" - def _fetch_user_from_item(self, item): - user, uid = self._helper.item_user(item) - if user and not self._state.has_visited_user(uid): - j = self._fetch_user(user) - if j and "user" in j: - j["user"]["uid"] = uid - self._state.mark_user_as_visited(uid) - return j["user"] - return None + post["score"] = int(soup.find("div", class_="score").text) + post["title"] = soup.find("a", id="title").text.strip() + post["link"] = soup.find("a", id="title").get("href") + + post["upvotes"] = int(soup.find("a", class_="pscorep").text) + post["downvotes"] = int(soup.find("a", class_="pscoren").text) + + post["posted"] = soup.find("div", id="postinfo").find("time-ago").get("datetime") + content_elem = soup.find("div", id="postcontent") + if content_elem: + post["content"] = str(content_elem) + + return post + + def _parse_comments(self, r, soup, pid): + + for comment_elem in soup.find_all("article"): + sub = r.url.split("/")[-2] + + comment = { + # Save v2 comments on purpose because we save the parent_pid field and not in v1 + "_id": "v2_" + comment_elem.get("id"), + "_sub": sub, + "cid": comment_elem.get("id"), + "parent_pid": pid, + "content": str(comment_elem.find("div", class_="content")), + "posted": comment_elem.find("time-ago").get("datetime") + } + + comment_head = comment_elem.find("div", class_="commenthead") + author_elem = comment_head.find("a", href=lambda x: x and x.startswith("/u/")) + if author_elem: + comment["user"] = author_elem.text + else: + comment["user"] = "[deleted]" + + parent_elem = comment_elem.parent + if parent_elem.get("id").startswith("child"): + comment["parentcid"] = parent_elem.get("id")[len("child-"):] + + yield comment def all_items(self): - for board in self._helper.boards(): - for post in self._posts(board): - cur_board = post["sub"] - yield post, cur_board + for pid in range(1, 500_000): + if self._state.has_visited(pid): + continue + url = f"https://poal.co/s/all/{pid}" - user = self._fetch_user_from_item(post) - if user: - yield user, cur_board + r = self._web.get(url) + if r.status_code == 404: + # Assume that we reached the end (?) for now + return - if self._state.has_new_comments(post, self._helper): - for comment in self._fetch_comments(post): - yield comment, cur_board - user = self._fetch_user_from_item(post) - if user: - yield user, cur_board - self._state.mark_post_as_visited(post, self._helper) + if r.status_code == 406: + # " This sub is disabled You're not allowed to see this stuff" + self._state.mark_visited(pid) + continue + + soup = BeautifulSoup(r.content, "html.parser") + + yield self._parse_post(r, soup, pid), "post" + + for com in self._parse_comments(r, soup, pid): + yield com, "comment" + + self._state.mark_visited(pid) diff --git a/post_process.py b/post_process.py deleted file mode 100644 index 03205c3..0000000 --- a/post_process.py +++ /dev/null @@ -1,22 +0,0 @@ -from hexlib.regex import LINK_RE - - -def post_process(item, board, helper): - item["_v"] = 1.0 - item["_id"] = helper.item_unique_id(item) - - if helper.item_type(item) != "user": - item["_urls"] = helper.item_urls(item) - if helper.item_type(item) == "comment": - item["_sub"] = board - - return item - - -def get_links_from_body(body): - result = [] - - for match in LINK_RE.finditer(body): - url = match.group(1) - result.append(url) - return result diff --git a/requirements.txt b/requirements.txt index ddbb911..a1dfc64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ requests -urllib3 redis git+git://github.com/simon987/hexlib.git +bs4 diff --git a/run.py b/run.py index ec8c9f8..9bc3b7a 100644 --- a/run.py +++ b/run.py @@ -1,64 +1,32 @@ import json -from queue import Queue -from threading import Thread -from hexlib.concurrency import queue_iter -from hexlib.env import get_redis +from hexlib.env import get_redis, redis_publish -from poal import PoalScanner, PoalHelper -from post_process import post_process +from poal import PoalScanner from state import PoalState -def publish_worker(queue: Queue, helper): - for item, board in queue_iter(queue): - publish(item, board, helper) - - -def once(func): - def wrapper(item, board, helper): - if not state.has_visited(helper.item_unique_id(item)): - func(item, board, helper) - state.mark_visited(helper.item_unique_id(item)) - - return wrapper - - -@once -def publish(item, board, helper): - post_process(item, board, helper) - - item_type = helper.item_type(item) - routing_key = "%s.%s" % (item_type, board) +def publish(item, item_type): + item["_v"] = 2.0 + board = item["_sub"] if "_sub" in item else item["sub"] message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True) - rdb.lpush("arc.poal." + routing_key, message) + redis_publish( + rdb, + item=message, + item_type=item_type, + item_project="poal", + item_category=board + ) -HELPER = PoalHelper( - boards=( - "all", - # TODO: Are there hidden boards that do not show up in /all ? - ), - url="https://poal.co" -) if __name__ == "__main__": - - state = PoalState("poal") + state = PoalState("poalv2") rdb = get_redis() - publish_q = Queue() - publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER)) - publish_thread.setDaemon(True) - publish_thread.start() + s = PoalScanner(state) - s = PoalScanner(state, HELPER) while True: - try: - for item, board in s.all_items(): - publish_q.put((item, board)) - except KeyboardInterrupt as e: - print("cleanup..") - publish_q.put((None, None)) - break + for item, item_type in s.all_items(): + publish(item, item_type) diff --git a/state.py b/state.py index 9ad8b12..1ab7847 100644 --- a/state.py +++ b/state.py @@ -1,27 +1,12 @@ -from hexlib.db import VolatileState, VolatileBooleanState +from hexlib.db import VolatileBooleanState class PoalState: def __init__(self, prefix): - self._posts = VolatileState(prefix) - self._comments = VolatileBooleanState(prefix) - self._users = VolatileBooleanState(prefix) + self._state = VolatileBooleanState(prefix, sep=".") - def has_visited(self, item_id): - return self._comments["comments"][item_id] + def mark_visited(self, pid): + self._state["pid"][pid] = True - def mark_visited(self, item_id): - self._comments["comments"][item_id] = True - - def mark_post_as_visited(self, post, helper): - self._posts["posts"][helper.item_unique_id(post)] = post["comments"] - - def has_new_comments(self, post, helper): - comment_count = self._posts["posts"][helper.item_unique_id(post)] - return comment_count is None or post["comments"] > comment_count - - def has_visited_user(self, uid): - return self._users["users"][uid.replace("-", "")] - - def mark_user_as_visited(self, uid): - self._users["users"][uid.replace("-", "")] = True + def has_visited(self, pid): + return self._state["pid"][pid]