From b6c42c1db3cb9f5c5d48dbedaab4cddf73198b09 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 8 Sep 2019 19:53:06 -0400 Subject: [PATCH] add fchan --- chan/chan.py | 14 ++++++++++- chan/desuchan_html.py | 5 +--- chan/fchan_html.py | 58 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 chan/fchan_html.py diff --git a/chan/chan.py b/chan/chan.py index 4995821..8638200 100644 --- a/chan/chan.py +++ b/chan/chan.py @@ -2,6 +2,7 @@ from chan.alokal_json import AlokalJsonChanHelper from chan.desuchan_html import DesuChanHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper +from chan.fchan_html import FChanHtmlChanHelper from chan.json import JsonChanHelper from chan.lolnada_html import LolNadaHtmlChanHelper from chan.mayuri import MayuriChanHelper @@ -106,7 +107,7 @@ CHANS = { "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga", "vape", "h", "ho", "hc", "e", "fet", "sex", "fag" ), - rps=1/10 + rps=1/5 ), "endchan": EndchanHtmlChanHelper( 8, @@ -268,4 +269,15 @@ CHANS = { ), rps=1/20, ), + "fchan": FChanHtmlChanHelper( + 21, + "http://fchan.us/", + "http://fchan.us/", + "/res/", + "/src/", + ( + "f", "m", "h", "s", "toon", "a", "ah", "c", "artist", "crit", "b" + ), + rps=1/60, + ), } diff --git a/chan/desuchan_html.py b/chan/desuchan_html.py index fca50f1..dcba0ab 100644 --- a/chan/desuchan_html.py +++ b/chan/desuchan_html.py @@ -53,10 +53,7 @@ class DesuChanHtmlChanHelper(ChanHelper): def parse_thread(r): soup = BeautifulSoup(r.text, "html.parser") - op_el = None - for div in soup.find_all("div", id=lambda tid: tid and tid[1:].isdigit()): - op_el = div - break + op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit()) for post_el in op_el.find_all("table", recursive=False): label = post_el.find("label") diff --git a/chan/fchan_html.py b/chan/fchan_html.py new file mode 100644 index 0000000..3c44149 --- /dev/null +++ 
import datetime
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from chan.desuchan_html import DesuChanHtmlChanHelper

# Thread containers / OP divs are id'd "thread<NN>". Compiled once at import
# instead of re-compiling a (previously non-raw) pattern string on every call.
_THREAD_ID_RE = re.compile(r"thread[0-9]+")


class FChanHtmlChanHelper(DesuChanHtmlChanHelper):
    """Scraper helper for fchan.us (Kusaba-style HTML board).

    Inherits the request/paging plumbing from DesuChanHtmlChanHelper and
    overrides the two HTML-parsing hooks for fchan's markup: thread
    containers are <div id="threadNNN">, reply cells are
    <td class="reply" id="replyNNN">.
    """

    def parse_threads_list(self, r):
        """Parse one board-index page.

        :param r: HTTP response whose ``.text`` is the board index HTML.
        :return: ``(threads, next_url)`` — ``threads`` is a list of
                 ``{"id": int}`` dicts; ``next_url`` is the absolute URL of
                 the next index page, or ``None`` when there is no "Next"
                 pagination link.
        """
        soup = BeautifulSoup(r.text, "html.parser")

        # Strip the 6-char "thread" prefix of the div id to get the numeric id.
        threads = [
            {"id": int(div.get("id")[6:])}
            for div in soup.find_all(
                "div", id=lambda tid: tid and _THREAD_ID_RE.match(tid))
        ]

        # Pagination: the anchor whose text is exactly "Next", when present.
        next_link = next(
            (a for a in soup.find_all("a") if a.text == "Next"), None)
        if next_link:
            return threads, urljoin(r.url, next_link.get("href"))
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield the OP and every reply of a single thread page.

        :param r: HTTP response whose ``.text`` is the thread HTML.
        :yields: dicts with ``"id"`` (int), ``"type"`` (``"thread"`` for the
                 OP, ``"post"`` for replies), raw ``"html"``, and a Unix
                 ``"time"`` stamp.
        """

        def _post_time(post_el):
            # The last child of the post's <label> is the date text,
            # e.g. "19/09/08(Sun)19:53" — two-digit year, no timezone.
            *_, date_text = post_el.find("label").children
            return int(datetime.datetime.strptime(
                date_text.strip(), "%y/%m/%d(%a)%H:%M").timestamp())

        soup = BeautifulSoup(r.text, "html.parser")

        # The first "threadNNN" div on the page is the OP's container.
        op_el = soup.find("div", id=lambda tid: tid and _THREAD_ID_RE.match(tid))

        for i, post_el in enumerate(op_el.find_all("table", recursive=False)):
            if i == 0:
                # First <table> is the OP; its numeric id lives on the
                # surrounding thread div ("thread" prefix is 6 chars).
                yield {
                    "id": int(op_el.get("id")[6:]),
                    "type": "thread",
                    "html": str(post_el),
                    "time": _post_time(post_el),
                }
            else:
                # Replies carry their id on a td whose class list contains
                # "reply"; the "reply" id prefix is 5 chars.
                yield {
                    "id": int(post_el.find(
                        "td", class_=lambda x: x and "reply" in x).get("id")[5:]),
                    "type": "post",
                    "html": str(post_el),
                    "time": _post_time(post_el),
                }