import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from chan.helper import ChanHelper
from post_process import get_links_from_html_body


class Plus4ChanHelper(ChanHelper):
    """Chan helper for boards.plus4chan.org (server-rendered HTML boards).

    Thread lists are paginated <section class="t"> elements; posts inside a
    thread are <div class="p"> elements.  Both carry their numeric id in the
    element id attribute, prefixed with a single letter (e.g. "t12345").
    """

    def threads_url(self, board):
        """Return the URL of the first page of *board*'s thread index."""
        return "%s%s/" % (self._base_url, board)

    def posts_url(self, board, thread):
        """Return the URL of a thread page, e.g. .../co/t123456.html."""
        return "%s%s/t%d.html" % (self._base_url, board, self.item_id(thread))

    @staticmethod
    def item_id(item):
        """Numeric id of a thread or post dict."""
        return item["id"]

    def item_urls(self, item, board):
        """De-duplicated outbound links extracted from the item's HTML."""
        return list(set(get_links_from_html_body(item["html"], self._base_url)))

    @staticmethod
    def item_type(item):
        """Either "thread" or "post" (set by parse_thread)."""
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        # NOTE(review): the omitted-post count stands in for a modification
        # time here (same convention as the kev4 helper) — presumably used
        # only as a cheap change indicator; confirm against ChanHelper.
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
        """Unix timestamp parsed from the post's <time class="date">."""
        return item["time"]

    def parse_threads_list(self, r):
        """Parse one index page.

        Returns a (threads, next_page_url) tuple; next_page_url is None on
        the last page.  Each thread dict has "id" and "omit" (omitted-post
        count, 0 when no breakdown link is present).
        """
        page = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        threads = []
        # Thread sections look like <section class="t" id="t12345">.
        for section in page.find_all("section", class_="t",
                                     id=lambda x: x and x[1:].isnumeric()):
            breakdown = section.find("a", class_="omittedbreakdown")
            omitted = int(breakdown.text.split(" ")[1]) if breakdown else 0
            threads.append({
                "id": int(section.get("id")[1:]),
                "omit": omitted,
            })

        # Pagination: follow the anchor whose visible text is exactly "next".
        next_url = None
        for anchor in page.find_all("a", href=lambda x: x):
            if anchor.text == "next":
                next_url = urljoin(r.url, anchor.get("href"))
                break
        return threads, next_url

    @staticmethod
    def parse_thread(r):
        """Yield the OP (type "thread") and replies (type "post") of a thread page.

        Replies additionally carry a "parent" key holding the thread id.
        Timestamps are parsed from the naive site-local "%Y/%m/%d %H:%M:%S"
        format — no timezone information is available in the markup.
        """
        page = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        op_section = page.find("section", class_="t")
        thread_id = int(op_section.get("id")[1:])

        for post_section in page.find_all("div", class_="p",
                                          id=lambda x: x and x[1:].isnumeric()):
            post_id = int(post_section.get("id")[1:])
            posted = int(datetime.datetime.strptime(
                post_section.find("time", class_="date").text,
                "%Y/%m/%d %H:%M:%S").timestamp())
            item = {
                "id": post_id,
                "type": "thread" if post_id == thread_id else "post",
                "html": str(post_section),
                "time": posted,
            }
            if post_id != thread_id:
                item["parent"] = thread_id
            yield item