mirror of
https://github.com/simon987/poal_feed.git
synced 2025-04-08 05:06:47 +00:00
v2 update
This commit is contained in:
parent
d2bfcea44d
commit
888c60c269
@ -8,4 +8,4 @@ services:
|
||||
- "REDIS_HOST="
|
||||
- "RPS=1"
|
||||
- "USER_AGENT=firefox"
|
||||
- "PF_MAX_PAGE=300"
|
||||
- "PROXY="
|
||||
|
241
poal.py
241
poal.py
@ -1,185 +1,92 @@
|
||||
import json
|
||||
import os
|
||||
from json import JSONDecodeError
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from hexlib.env import get_web
|
||||
from hexlib.log import logger
|
||||
|
||||
from post_process import get_links_from_body
|
||||
from state import PoalState
|
||||
|
||||
PF_MAX_PAGE = int(os.environ.get("PF_MAX_PAGE", 9999999))
|
||||
|
||||
|
||||
class PoalHelper:
|
||||
|
||||
def __init__(self, url, boards):
|
||||
self._boards = boards
|
||||
self._url = url
|
||||
|
||||
def item_unique_id(self, item):
|
||||
item_type = self.item_type(item)
|
||||
if item_type == "post":
|
||||
return item["pid"]
|
||||
if item_type == "comment":
|
||||
return item["cid"]
|
||||
|
||||
return item["uid"]
|
||||
|
||||
def item_urls(self, item):
|
||||
item_type = self.item_type(item)
|
||||
if item_type == "post":
|
||||
urls = [
|
||||
item["link"],
|
||||
*(get_links_from_body(item["content"]) if item["content"] else [])
|
||||
]
|
||||
if item["thumbnail"]:
|
||||
urls.append(urljoin("https://poal.co/static/thumbs/", item["thumbnail"]))
|
||||
return urls
|
||||
return get_links_from_body(item["content"]) if item["content"] else []
|
||||
|
||||
def item_type(self, item):
|
||||
if "cid" in item:
|
||||
return "comment"
|
||||
if "pid" in item:
|
||||
return "post"
|
||||
return "user"
|
||||
|
||||
def item_user(self, item):
|
||||
return (item["user"], item["uid"]) if "uid" in item and item["user"] != "[Deleted]" else (None, None)
|
||||
|
||||
def boards(self):
|
||||
return [b.replace("\\_", "_") for b in self._boards if not b.startswith("_")]
|
||||
|
||||
def posts_url(self, board, page=1):
|
||||
return "%s/api/getPostList/%s/new/%d" % (self._url, board, page)
|
||||
|
||||
def comments_url(self, post_id, page=1):
|
||||
return "%s/api/getPostComments/%s/%d" % (self._url, str(post_id), page)
|
||||
|
||||
def user_url(self, username):
|
||||
return "%s/api/getUser/%s" % (self._url, username)
|
||||
|
||||
def parse_posts_list(self, r, board):
|
||||
try:
|
||||
j = json.loads(r.content.decode('utf-8', 'ignore'))
|
||||
if "posts" not in j:
|
||||
logger.warning("No posts in response for %s: %s" % (r.url, r.text,))
|
||||
return [], None
|
||||
except JSONDecodeError:
|
||||
logger.warning("JSONDecodeError for %s:" % (r.url,))
|
||||
logger.warning(r.text)
|
||||
return [], None
|
||||
|
||||
posts = j["posts"]
|
||||
if len(posts) == 25:
|
||||
if len(r.history):
|
||||
page = 1
|
||||
else:
|
||||
page = int(r.url.split("/")[-1])
|
||||
if page + 1 > PF_MAX_PAGE:
|
||||
return posts, None
|
||||
return posts, self.posts_url(board, page=page + 1)
|
||||
return posts, None
|
||||
|
||||
def parse_comments(self, r):
|
||||
try:
|
||||
j = json.loads(r.content.decode('utf-8', 'ignore'))
|
||||
except JSONDecodeError:
|
||||
logger.warning("JSONDecodeError for %s:" % (r.url,))
|
||||
logger.warning(r.text)
|
||||
return []
|
||||
|
||||
comments = j["comments"]
|
||||
if len(comments) == 50:
|
||||
if len(r.history):
|
||||
pid = int(r.url.split("/")[-1])
|
||||
page = 1
|
||||
else:
|
||||
pid = int(r.url.split("/")[-2])
|
||||
page = int(r.url.split("/")[-1])
|
||||
return comments, self.comments_url(pid, page=page + 1)
|
||||
return comments, None
|
||||
|
||||
|
||||
class PoalScanner:
|
||||
|
||||
def __init__(self, state: PoalState, helper: PoalHelper):
|
||||
def __init__(self, state: PoalState):
|
||||
self._state = state
|
||||
self._helper = helper
|
||||
self._web = get_web()
|
||||
|
||||
def _posts(self, board):
|
||||
r = self._web.get(self._helper.posts_url(board))
|
||||
if not r or r.status_code != 200:
|
||||
return []
|
||||
def _parse_post(self, r, soup, pid):
|
||||
sub = r.url.split("/")[-2]
|
||||
|
||||
while True:
|
||||
threads, next_url = self._helper.parse_posts_list(r, board)
|
||||
for thread in threads:
|
||||
yield thread
|
||||
if not next_url:
|
||||
break
|
||||
r = self._web.get(next_url)
|
||||
if not r or r.status_code != 200:
|
||||
break
|
||||
post = {
|
||||
"_id": pid,
|
||||
"pid": pid,
|
||||
"sub": sub,
|
||||
}
|
||||
|
||||
def _fetch_comments(self, post):
|
||||
r = self._web.get(self._helper.comments_url(post["pid"]))
|
||||
if not r or r.status_code != 200:
|
||||
return []
|
||||
|
||||
while True:
|
||||
comments, next_url = self._helper.parse_comments(r)
|
||||
for comment in comments:
|
||||
yield comment
|
||||
if not next_url:
|
||||
break
|
||||
r = self._web.get(next_url)
|
||||
if not r or r.status_code != 200:
|
||||
break
|
||||
|
||||
def _fetch_user(self, username):
|
||||
r = self._web.get(self._helper.user_url(username))
|
||||
if not r or r.status_code != 200:
|
||||
return None
|
||||
return self.parse_user(r)
|
||||
|
||||
def parse_user(self, r):
|
||||
try:
|
||||
j = json.loads(r.content.decode('utf-8', 'ignore'))
|
||||
if "error" in j:
|
||||
return None
|
||||
return j
|
||||
except JSONDecodeError:
|
||||
logger.warning("JSONDecodeError for %s:" % (r.url,))
|
||||
logger.warning(r.text)
|
||||
post["user"] = soup.find("div", class_="postinfo").find("a", href=lambda x: x and x.startswith("/u/")).text
|
||||
except:
|
||||
post["user"] = "[deleted]"
|
||||
|
||||
def _fetch_user_from_item(self, item):
|
||||
user, uid = self._helper.item_user(item)
|
||||
if user and not self._state.has_visited_user(uid):
|
||||
j = self._fetch_user(user)
|
||||
if j and "user" in j:
|
||||
j["user"]["uid"] = uid
|
||||
self._state.mark_user_as_visited(uid)
|
||||
return j["user"]
|
||||
return None
|
||||
post["score"] = int(soup.find("div", class_="score").text)
|
||||
post["title"] = soup.find("a", id="title").text.strip()
|
||||
post["link"] = soup.find("a", id="title").get("href")
|
||||
|
||||
post["upvotes"] = int(soup.find("a", class_="pscorep").text)
|
||||
post["downvotes"] = int(soup.find("a", class_="pscoren").text)
|
||||
|
||||
post["posted"] = soup.find("div", id="postinfo").find("time-ago").get("datetime")
|
||||
content_elem = soup.find("div", id="postcontent")
|
||||
if content_elem:
|
||||
post["content"] = str(content_elem)
|
||||
|
||||
return post
|
||||
|
||||
def _parse_comments(self, r, soup, pid):
|
||||
|
||||
for comment_elem in soup.find_all("article"):
|
||||
sub = r.url.split("/")[-2]
|
||||
|
||||
comment = {
|
||||
# Save v2 comments on purpose because we save the parent_pid field and not in v1
|
||||
"_id": "v2_" + comment_elem.get("id"),
|
||||
"_sub": sub,
|
||||
"cid": comment_elem.get("id"),
|
||||
"parent_pid": pid,
|
||||
"content": str(comment_elem.find("div", class_="content")),
|
||||
"posted": comment_elem.find("time-ago").get("datetime")
|
||||
}
|
||||
|
||||
comment_head = comment_elem.find("div", class_="commenthead")
|
||||
author_elem = comment_head.find("a", href=lambda x: x and x.startswith("/u/"))
|
||||
if author_elem:
|
||||
comment["user"] = author_elem.text
|
||||
else:
|
||||
comment["user"] = "[deleted]"
|
||||
|
||||
parent_elem = comment_elem.parent
|
||||
if parent_elem.get("id").startswith("child"):
|
||||
comment["parentcid"] = parent_elem.get("id")[len("child-"):]
|
||||
|
||||
yield comment
|
||||
|
||||
def all_items(self):
|
||||
for board in self._helper.boards():
|
||||
for post in self._posts(board):
|
||||
cur_board = post["sub"]
|
||||
yield post, cur_board
|
||||
for pid in range(1, 500_000):
|
||||
if self._state.has_visited(pid):
|
||||
continue
|
||||
url = f"https://poal.co/s/all/{pid}"
|
||||
|
||||
user = self._fetch_user_from_item(post)
|
||||
if user:
|
||||
yield user, cur_board
|
||||
r = self._web.get(url)
|
||||
if r.status_code == 404:
|
||||
# Assume that we reached the end (?) for now
|
||||
return
|
||||
|
||||
if self._state.has_new_comments(post, self._helper):
|
||||
for comment in self._fetch_comments(post):
|
||||
yield comment, cur_board
|
||||
user = self._fetch_user_from_item(post)
|
||||
if user:
|
||||
yield user, cur_board
|
||||
self._state.mark_post_as_visited(post, self._helper)
|
||||
if r.status_code == 406:
|
||||
# " This sub is disabled You're not allowed to see this stuff"
|
||||
self._state.mark_visited(pid)
|
||||
continue
|
||||
|
||||
soup = BeautifulSoup(r.content, "html.parser")
|
||||
|
||||
yield self._parse_post(r, soup, pid), "post"
|
||||
|
||||
for com in self._parse_comments(r, soup, pid):
|
||||
yield com, "comment"
|
||||
|
||||
self._state.mark_visited(pid)
|
||||
|
@ -1,22 +0,0 @@
|
||||
from hexlib.regex import LINK_RE
|
||||
|
||||
|
||||
def post_process(item, board, helper):
|
||||
item["_v"] = 1.0
|
||||
item["_id"] = helper.item_unique_id(item)
|
||||
|
||||
if helper.item_type(item) != "user":
|
||||
item["_urls"] = helper.item_urls(item)
|
||||
if helper.item_type(item) == "comment":
|
||||
item["_sub"] = board
|
||||
|
||||
return item
|
||||
|
||||
|
||||
def get_links_from_body(body):
|
||||
result = []
|
||||
|
||||
for match in LINK_RE.finditer(body):
|
||||
url = match.group(1)
|
||||
result.append(url)
|
||||
return result
|
@ -1,4 +1,4 @@
|
||||
requests
|
||||
urllib3
|
||||
redis
|
||||
git+git://github.com/simon987/hexlib.git
|
||||
bs4
|
||||
|
64
run.py
64
run.py
@ -1,64 +1,32 @@
|
||||
import json
|
||||
from queue import Queue
|
||||
from threading import Thread
|
||||
|
||||
from hexlib.concurrency import queue_iter
|
||||
from hexlib.env import get_redis
|
||||
from hexlib.env import get_redis, redis_publish
|
||||
|
||||
from poal import PoalScanner, PoalHelper
|
||||
from post_process import post_process
|
||||
from poal import PoalScanner
|
||||
from state import PoalState
|
||||
|
||||
|
||||
def publish_worker(queue: Queue, helper):
|
||||
for item, board in queue_iter(queue):
|
||||
publish(item, board, helper)
|
||||
|
||||
|
||||
def once(func):
|
||||
def wrapper(item, board, helper):
|
||||
if not state.has_visited(helper.item_unique_id(item)):
|
||||
func(item, board, helper)
|
||||
state.mark_visited(helper.item_unique_id(item))
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
@once
|
||||
def publish(item, board, helper):
|
||||
post_process(item, board, helper)
|
||||
|
||||
item_type = helper.item_type(item)
|
||||
routing_key = "%s.%s" % (item_type, board)
|
||||
def publish(item, item_type):
|
||||
item["_v"] = 2.0
|
||||
|
||||
board = item["_sub"] if "_sub" in item else item["sub"]
|
||||
message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
|
||||
rdb.lpush("arc.poal." + routing_key, message)
|
||||
|
||||
redis_publish(
|
||||
rdb,
|
||||
item=message,
|
||||
item_type=item_type,
|
||||
item_project="poal",
|
||||
item_category=board
|
||||
)
|
||||
|
||||
HELPER = PoalHelper(
|
||||
boards=(
|
||||
"all",
|
||||
# TODO: Are there hidden boards that do not show up in /all ?
|
||||
),
|
||||
url="https://poal.co"
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
state = PoalState("poal")
|
||||
state = PoalState("poalv2")
|
||||
rdb = get_redis()
|
||||
|
||||
publish_q = Queue()
|
||||
publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER))
|
||||
publish_thread.setDaemon(True)
|
||||
publish_thread.start()
|
||||
s = PoalScanner(state)
|
||||
|
||||
s = PoalScanner(state, HELPER)
|
||||
while True:
|
||||
try:
|
||||
for item, board in s.all_items():
|
||||
publish_q.put((item, board))
|
||||
except KeyboardInterrupt as e:
|
||||
print("cleanup..")
|
||||
publish_q.put((None, None))
|
||||
break
|
||||
for item, item_type in s.all_items():
|
||||
publish(item, item_type)
|
||||
|
27
state.py
27
state.py
@ -1,27 +1,12 @@
|
||||
from hexlib.db import VolatileState, VolatileBooleanState
|
||||
from hexlib.db import VolatileBooleanState
|
||||
|
||||
|
||||
class PoalState:
|
||||
def __init__(self, prefix):
|
||||
self._posts = VolatileState(prefix)
|
||||
self._comments = VolatileBooleanState(prefix)
|
||||
self._users = VolatileBooleanState(prefix)
|
||||
self._state = VolatileBooleanState(prefix, sep=".")
|
||||
|
||||
def has_visited(self, item_id):
|
||||
return self._comments["comments"][item_id]
|
||||
def mark_visited(self, pid):
|
||||
self._state["pid"][pid] = True
|
||||
|
||||
def mark_visited(self, item_id):
|
||||
self._comments["comments"][item_id] = True
|
||||
|
||||
def mark_post_as_visited(self, post, helper):
|
||||
self._posts["posts"][helper.item_unique_id(post)] = post["comments"]
|
||||
|
||||
def has_new_comments(self, post, helper):
|
||||
comment_count = self._posts["posts"][helper.item_unique_id(post)]
|
||||
return comment_count is None or post["comments"] > comment_count
|
||||
|
||||
def has_visited_user(self, uid):
|
||||
return self._users["users"][uid.replace("-", "")]
|
||||
|
||||
def mark_user_as_visited(self, uid):
|
||||
self._users["users"][uid.replace("-", "")] = True
|
||||
def has_visited(self, pid):
|
||||
return self._state["pid"][pid]
|
||||
|
Loading…
x
Reference in New Issue
Block a user