From e13770d603ebaf23391000cc3360f3803686b434 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 8 Sep 2019 15:50:05 -0400 Subject: [PATCH] add desuchan --- README.md | 2 +- chan/chan.py | 20 ++++++++++- chan/desuchan_html.py | 80 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 chan/desuchan_html.py diff --git a/README.md b/README.md index d964445..8dbd9b0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ Compatible image boards: 4chan, lainchan, uboachan, 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal, -horochan, doushio. +horochan, doushio, desuchan. Can optionally push monitoring data to InfluxDB. Below is an example of Grafana being used to display it. diff --git a/chan/chan.py b/chan/chan.py index e473c9e..ecc7666 100644 --- a/chan/chan.py +++ b/chan/chan.py @@ -1,4 +1,5 @@ from chan.alokal_json import AlokalJsonChanHelper +from chan.desuchan_html import DesuChanHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper from chan.json import JsonChanHelper @@ -213,5 +214,22 @@ CHANS = { "moe", ), rps=1 - ) + ), + "desuchan": DesuChanHtmlChanHelper( + 17, + "https://desuchan.net/", + "https://desuchan.net/", + "/res/", + "/src/", + ( + "bananas", "boku", "dawa", "desu", "jum", "kashira", "md", + "otousama", "ro", "unyuu", "yakult", "a", "c", "h", "_loli", + "moonspeak", "nagato", "nij", "nipa", "touhou", "tr", "yan", + "yan", "vn", "do", "fi", "lit", "o", "pro", "tech", "v", "vic", + "arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w", + "sandbox", "sugg" + ), + rps=1/10 + ), + } diff --git a/chan/desuchan_html.py b/chan/desuchan_html.py new file mode 100644 index 0000000..0a26312 --- /dev/null +++ b/chan/desuchan_html.py @@ -0,0 +1,80 @@ +import datetime + +from bs4 import BeautifulSoup + +from chan.helper import ChanHelper +from post_process import 
get_links_from_html_body


class DesuChanHtmlChanHelper(ChanHelper):
    """Chan helper for desuchan.net, a Kusaba-style board with no JSON API:
    thread lists and posts are scraped from the HTML pages."""

    def threads_url(self, board):
        """Return the URL of the first index page for *board*."""
        return "%s%s/" % (self._base_url, board)

    def posts_url(self, board, thread):
        """Return the URL of an individual thread page."""
        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)

    @staticmethod
    def item_id(item):
        # Items are the plain dicts built by parse_threads_list/parse_thread.
        return item["id"]

    def item_urls(self, item, board):
        # De-duplicate the links extracted from the raw post HTML.
        return list(set(get_links_from_html_body(item["html"], self._base_url)))

    @staticmethod
    def item_type(item):
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        return -1  # TODO: Parse the 'X posts, Y images' span

    @staticmethod
    def item_mtime(item):
        return item["time"]

    def parse_threads_list(self, r):
        """Parse a board index page.

        Returns a tuple ``(threads, next_page_url)`` where *threads* is a
        list of ``{"id": int}`` dicts and *next_page_url* is ``None`` on the
        last page.
        """
        soup = BeautifulSoup(r.text, "html.parser")

        # Thread containers are divs with ids like "t12345".
        threads = [
            {"id": int(thread_el.get("id")[1:])}
            for thread_el in soup.find_all("div", id=lambda tid: tid and tid[1:].isdigit())
        ]

        # Pagination: the "Next" button sits in a <form> whose action holds
        # the relative URL of the next page ("none" on the last page).
        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Next"})
            if next_button and form.get("action") != "none":
                return threads, self._base_url.rstrip("/") + form.get("action")
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield one dict per reply (type "post") followed by the OP dict
        (type "thread") for a thread page."""
        soup = BeautifulSoup(r.text, "html.parser")

        # The OP container is the first div with an id like "t12345".
        op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
        if op_el is None:
            # Unrecognized page layout (e.g. an error page): nothing to yield.
            return

        for post_el in op_el.find_all("table", recursive=False):
            label = post_el.find("label")
            if not label:
                # FIX: unexpected reply markup — log it and skip instead of
                # falling through to label.children and raising
                # AttributeError on None.
                print(post_el)
                continue
            *_, time = label.children
            yield {
                # FIX: attrs must be a dict mapping attribute -> value; the
                # original passed the set {"class", "reply"}.
                "id": int(post_el.find("td", attrs={"class": "reply"}).get("id")[5:]),
                "type": "post",
                "html": str(post_el),
                "time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp())
            }
            # Remove the reply so the final str(op_el) holds only the OP.
            post_el.decompose()

        *_, time = op_el.find("label").children
        yield {
            "id": int(op_el.get("id")[1:]),
            "type": "thread",
            "html": str(op_el),
            "time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp())
        }