from bs4 import BeautifulSoup

from hexlib.env import get_web

from state import PoalState


def _is_user_link(href):
    """Return True when *href* is a non-empty string starting with ``/u/``."""
    return bool(href) and href.startswith("/u/")


class PoalScanner:
    """Walk poal.co post ids in order and yield the parsed posts and comments."""

    # Exclusive upper bound of the post ids that all_items() tries.
    _PID_LIMIT = 500_000
    # all_items() stops once MORE than this many consecutive requests 404.
    _MAX_NOT_FOUND_IN_A_ROW = 10

    def __init__(self, state: PoalState):
        """Keep *state* for visited-pid bookkeeping and create the web client."""
        self._state = state
        self._web = get_web()

    def _parse_post(self, r, soup, pid):
        """Build the post document for *pid* from a parsed post page.

        :param r: response object; its ``url`` is split on "/" and the
            second-to-last segment is stored as the sub.
        :param soup: BeautifulSoup tree of the response body.
        :param pid: post id, stored as both ``_id`` and ``pid``.
        :return: dict with the post fields; ``content`` is only present when
            the page has a ``div#postcontent`` element.
        :raises AttributeError: if a mandatory element (score, title, vote
            counters, timestamp) is missing from the page.
        :raises ValueError: if a score/vote element does not contain an int.
        """
        sub = r.url.split("/")[-2]

        post = {
            "_id": pid,
            "pid": pid,
            "sub": sub,
        }

        # Explicit None checks instead of a bare ``except:`` so that only a
        # genuinely missing author link falls back to "[deleted]".
        info_elem = soup.find("div", class_="postinfo")
        author_elem = None
        if info_elem is not None:
            author_elem = info_elem.find("a", href=_is_user_link)
        post["user"] = author_elem.text if author_elem is not None else "[deleted]"

        post["score"] = int(soup.find("div", class_="score").text)

        title_elem = soup.find("a", id="title")
        post["title"] = title_elem.text.strip()
        post["link"] = title_elem.get("href")

        post["upvotes"] = int(soup.find("a", class_="pscorep").text)
        post["downvotes"] = int(soup.find("a", class_="pscoren").text)
        post["posted"] = soup.find("div", id="postinfo").find("time-ago").get("datetime")

        content_elem = soup.find("div", id="postcontent")
        if content_elem:
            post["content"] = str(content_elem)

        return post

    def _parse_comments(self, r, soup, pid):
        """Yield one comment document per ``<article>`` element in *soup*.

        :param r: response object; the sub is taken from ``r.url`` the same
            way as in ``_parse_post``.
        :param soup: BeautifulSoup tree of the response body.
        :param pid: id of the post the comments belong to (``parent_pid``).
        :return: generator of dicts; ``parentcid`` is only set when the
            article's parent element has an id starting with "child".
        """
        # Identical for every comment on the page, so compute it once.
        sub = r.url.split("/")[-2]

        for comment_elem in soup.find_all("article"):
            cid = comment_elem.get("id")
            comment = {
                # Ids carry a "v2_" prefix on purpose: v2 documents store the
                # parent_pid field, which v1 documents did not.
                "_id": "v2_" + cid,
                "_sub": sub,
                "cid": cid,
                "parent_pid": pid,
                "content": str(comment_elem.find("div", class_="content")),
                "posted": comment_elem.find("time-ago").get("datetime"),
            }

            comment_head = comment_elem.find("div", class_="commenthead")
            author_elem = comment_head.find("a", href=_is_user_link)
            if author_elem:
                comment["user"] = author_elem.text
            else:
                comment["user"] = "[deleted]"

            # A parent without an ``id`` attribute used to raise
            # AttributeError here; treat it as "no parent comment" instead.
            parent_id = comment_elem.parent.get("id") or ""
            if parent_id.startswith("child"):
                comment["parentcid"] = parent_id[len("child-"):]

            yield comment

    def all_items(self):
        """Yield ``(document, kind)`` tuples for every not-yet-visited post id.

        ``kind`` is ``"post"`` for the post document and ``"comment"`` for
        each of its comments. A pid is marked visited after its items have
        been yielded, or immediately when the server answers 406. A 404 pid
        is only marked visited when ``pid + 1`` has already been visited.
        Iteration stops after more than ``_MAX_NOT_FOUND_IN_A_ROW``
        consecutive 404 responses.
        """
        not_found_in_a_row = 0

        for pid in range(1, self._PID_LIMIT):
            if self._state.has_visited(pid):
                continue

            url = f"https://poal.co/s/all/{pid}"
            r = self._web.get(url)

            if r.status_code == 404:
                not_found_in_a_row += 1
                if not_found_in_a_row > self._MAX_NOT_FOUND_IN_A_ROW:
                    break
                # NOTE(review): presumably a 404 followed by an already
                # visited pid is a permanent gap rather than a post that does
                # not exist yet - confirm.
                if self._state.has_visited(pid + 1):
                    self._state.mark_visited(pid)
                continue

            not_found_in_a_row = 0

            if r.status_code == 406:
                # " This sub is disabled You're not allowed to see this stuff"
                self._state.mark_visited(pid)
                continue

            soup = BeautifulSoup(r.content, "html.parser")

            yield self._parse_post(r, soup, pid), "post"
            for com in self._parse_comments(r, soup, pid):
                yield com, "comment"

            self._state.mark_visited(pid)