poal_feed/poal.py

104 lines
3.2 KiB
Python

from bs4 import BeautifulSoup
from hexlib.env import get_web
from state import PoalState
class PoalScanner:
    """Scrape posts and their comments from poal.co by walking numeric post ids.

    Visited ids are tracked through the injected state object (``has_visited`` /
    ``mark_visited``), so ids that were already handled are not requested again.
    """

    _POST_URL = "https://poal.co/s/all/{pid}"
    # Exclusive upper bound of the post ids that are probed.
    _MAX_PID = 500_000
    # The scan stops once MORE than this many 404 responses occur in a row.
    _MAX_NOT_FOUND_IN_A_ROW = 10

    def __init__(self, state: "PoalState"):
        """Keep the visit-tracking state and create the HTTP client.

        :param state: object providing ``has_visited(pid)`` and ``mark_visited(pid)``.
        """
        self._state = state
        self._web = get_web()

    @staticmethod
    def _is_user_href(href):
        """Return True when *href* is a user profile link (starts with ``/u/``)."""
        return bool(href) and href.startswith("/u/")

    def _parse_post(self, r, soup, pid):
        """Build the post document from a parsed post page.

        :param r: HTTP response; only ``r.url`` is read, its second-to-last
            ``/``-separated segment is stored as the sub name.
        :param soup: parsed page of the post.
        :param pid: numeric id of the post, stored as both ``_id`` and ``pid``.
        :return: dict with ``_id``, ``pid``, ``sub``, ``user``, ``score``,
            ``title``, ``link``, ``upvotes``, ``downvotes``, ``posted`` and,
            when the page has a ``postcontent`` element, ``content`` (its HTML).
        :raises AttributeError: if the score, title, vote count or timestamp
            element is missing from the page.
        :raises ValueError: if the score or a vote count is not an integer.
        """
        sub = r.url.split("/")[-2]
        post = {
            "_id": pid,
            "pid": pid,
            "sub": sub,
        }

        # Explicit None checks instead of a bare ``except``: a missing info box
        # or author link means "[deleted]", anything else should still surface.
        # NOTE(review): the author lookup matches class="postinfo" while the
        # timestamp lookup below matches id="postinfo" -- confirm against the
        # real markup that both selectors are intended.
        info_elem = soup.find("div", class_="postinfo")
        author_elem = None
        if info_elem is not None:
            author_elem = info_elem.find("a", href=self._is_user_href)
        post["user"] = author_elem.text if author_elem is not None else "[deleted]"

        post["score"] = int(soup.find("div", class_="score").text)

        title_elem = soup.find("a", id="title")
        post["title"] = title_elem.text.strip()
        post["link"] = title_elem.get("href")

        post["upvotes"] = int(soup.find("a", class_="pscorep").text)
        post["downvotes"] = int(soup.find("a", class_="pscoren").text)
        post["posted"] = soup.find("div", id="postinfo").find("time-ago").get("datetime")

        content_elem = soup.find("div", id="postcontent")
        if content_elem is not None:
            post["content"] = str(content_elem)

        return post

    def _parse_comments(self, r, soup, pid):
        """Yield one document per ``<article>`` element (comment) of a post page.

        :param r: HTTP response; only ``r.url`` is read, its second-to-last
            ``/``-separated segment is stored as ``_sub``.
        :param soup: parsed page of the post.
        :param pid: id of the post the comments belong to (``parent_pid``).
        :return: generator of dicts with ``_id``, ``_sub``, ``cid``,
            ``parent_pid``, ``content``, ``posted``, ``user`` and, for replies,
            ``parentcid``.
        """
        # Same for every comment on the page, so compute it once.
        sub = r.url.split("/")[-2]

        for comment_elem in soup.find_all("article"):
            cid = comment_elem.get("id")
            comment = {
                # The "v2_" id prefix is deliberate: v2 comment records carry
                # the parent_pid field, which v1 records did not.
                "_id": "v2_" + cid,
                "_sub": sub,
                "cid": cid,
                "parent_pid": pid,
                "content": str(comment_elem.find("div", class_="content")),
                "posted": comment_elem.find("time-ago").get("datetime"),
            }

            # A missing header or author link is reported as "[deleted]",
            # mirroring the fallback used for posts.
            head_elem = comment_elem.find("div", class_="commenthead")
            author_elem = None
            if head_elem is not None:
                author_elem = head_elem.find("a", href=self._is_user_href)
            comment["user"] = author_elem.text if author_elem is not None else "[deleted]"

            # A reply sits inside a container whose id starts with "child";
            # the text after the "child-" prefix is stored as the parent
            # comment id. The container of other comments may have no id at
            # all, so guard against None before calling str methods.
            parent_elem = comment_elem.parent
            parent_id = parent_elem.get("id") if parent_elem is not None else None
            if parent_id and parent_id.startswith("child"):
                comment["parentcid"] = parent_id[len("child-"):]

            yield comment

    def all_items(self):
        """Walk post ids in ascending order and yield every scraped item.

        Ids already marked as visited are skipped without a request. For each
        remaining id:

        * 404: counted; the scan stops once more than
          ``_MAX_NOT_FOUND_IN_A_ROW`` occur consecutively. The id is marked
          visited only when the following id has already been visited
          (presumably so ids beyond the newest post are probed again on a
          later run -- confirm with the state implementation).
        * 406 (sub disabled / not allowed): marked visited and skipped.
        * anything else: the page is parsed, the post is yielded followed by
          its comments, and the id is marked visited once the consumer has
          pulled all of them.

        :return: generator of ``(document, kind)`` tuples where ``kind`` is
            ``"post"`` or ``"comment"``.
        """
        not_found_in_a_row = 0

        for pid in range(1, self._MAX_PID):
            if self._state.has_visited(pid):
                continue

            r = self._web.get(self._POST_URL.format(pid=pid))

            if r.status_code == 404:
                not_found_in_a_row += 1
                if not_found_in_a_row > self._MAX_NOT_FOUND_IN_A_ROW:
                    break
                if self._state.has_visited(pid + 1):
                    self._state.mark_visited(pid)
                continue
            not_found_in_a_row = 0

            if r.status_code == 406:
                # " This sub is disabled You're not allowed to see this stuff"
                self._state.mark_visited(pid)
                continue

            soup = BeautifulSoup(r.content, "html.parser")
            yield self._parse_post(r, soup, pid), "post"
            for com in self._parse_comments(r, soup, pid):
                yield com, "comment"

            self._state.mark_visited(pid)