diff --git a/README.md b/README.md
index 8dbd9b0..5307c21 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
 
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan, doushio, desuchan.
+horochan, doushio, desuchan, tgchan.
 
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.
diff --git a/chan/chan.py b/chan/chan.py
index 4c62a8c..ca01d5e 100644
--- a/chan/chan.py
+++ b/chan/chan.py
@@ -6,6 +6,7 @@ from chan.json import JsonChanHelper
 from chan.mayuri import MayuriChanHelper
 from chan.russian_json import RussianJsonChanHelper
 from chan.synch_json import SynchJsonChanHelper
+from chan.tgchan_html import TgChanHtmlChanHelper
 
 CHANS = {
     "4chan": JsonChanHelper(
@@ -243,4 +244,15 @@ CHANS = {
         ),
         rps=1/20
     ),
+    "tgchan": TgChanHtmlChanHelper(
+        19,
+        "https://tgchan.org/kusaba/",
+        "https://tgchan.org/kusaba/",
+        "/res/",
+        "/src/",
+        (
+            "draw", "meep", "quest", "questdis", "tg", "icons",
+        ),
+        rps=1,
+    )
 }
diff --git a/chan/desuchan_html.py b/chan/desuchan_html.py
index 0a26312..c9d2cea 100644
--- a/chan/desuchan_html.py
+++ b/chan/desuchan_html.py
@@ -60,8 +60,6 @@ class DesuChanHtmlChanHelper(ChanHelper):
 
         for post_el in op_el.find_all("table", recursive=False):
             label = post_el.find("label")
-            if not label:
-                print(post_el)
             *_, time = label.children
             yield {
                 "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
diff --git a/chan/tgchan_html.py b/chan/tgchan_html.py
new file mode 100644
index 0000000..9a65d35
--- /dev/null
+++ b/chan/tgchan_html.py
@@ -0,0 +1,78 @@
+import datetime
+import re
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from chan.desuchan_html import DesuChanHtmlChanHelper
+
+# Thread containers have ids of the form "thread<digits><letters>".
+THREAD_ID_PATTERN = re.compile(r"thread([0-9]+)")
+# strptime format of the timestamp text node, surrounding newlines included.
+POST_TIME_FORMAT = "\n\n%Y/%m/%d(%a)%H:%M\n"
+
+
+def _parse_post_time(label):
+    """Turn the last child of a post label into a Unix timestamp."""
+    # NOTE(review): the parsed datetime is naive, so timestamp() applies the
+    # local timezone -- confirm this matches the board's clock.
+    *_, time = label.children
+    return int(datetime.datetime.strptime(time, POST_TIME_FORMAT).timestamp())
+
+
+class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
+    """Parse board and thread pages of tgchan's HTML front end."""
+
+    def parse_threads_list(self, r):
+        """Collect the thread ids on a board page.
+
+        :param r: response whose ``text`` holds the board page HTML
+        :return: ``(threads, next_url)``; ``next_url`` is None when no
+            form with a usable "Next" button is found
+        """
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        threads = []
+        for thread_el in soup.find_all("div", id=True):
+            # Ignore unrelated divs instead of crashing on a failed match.
+            match = THREAD_ID_PATTERN.match(thread_el.get("id"))
+            if match:
+                threads.append({"id": int(match.group(1))})
+
+        for form in soup.find_all("form"):
+            next_button = form.find("input", attrs={"value": "Next"})
+            if next_button and form.get("action") != "none":
+                return threads, urljoin(self._base_url, form.get("action"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        """Yield every reply of a thread page, then the opening post.
+
+        :param r: response whose ``text`` holds the thread page HTML
+        """
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        op_el = soup.find("form", id="delform")
+
+        for post_el in op_el.find_all("table", recursive=False):
+            # attrs is a dict: a set literal {"class", "reply"} would only work
+            # by accident, since bs4 treats a non-dict attrs as a class filter.
+            reply_el = post_el.find("td", attrs={"class": "reply"})
+            yield {
+                # Drop the five-character id prefix (presumably "reply").
+                "id": int(reply_el.get("id")[5:]),
+                "type": "post",
+                "html": str(post_el),
+                "time": _parse_post_time(post_el.find("label")),
+            }
+            # Remove the reply so the thread HTML serialised below excludes it.
+            post_el.decompose()
+
+        op_anchor = op_el.find("a", attrs={"name": lambda x: x and x.isdigit()})
+        yield {
+            "id": int(op_anchor.get("name")),
+            "type": "thread",
+            "html": str(op_el),
+            "time": _parse_post_time(op_el.find("label")),
+        }