add desuchan

2025-12-17 05:59:02 +00:00 · 2019-09-08 15:50:05 -04:00
parent 18247d4139
commit e13770d603
3 changed files with 100 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan, doushio.
+horochan, doushio, desuchan.
 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.
--- a/chan/chan.py
+++ b/chan/chan.py
@@ -1,4 +1,5 @@
 from chan.alokal_json import AlokalJsonChanHelper
 from chan.desuchan_html import DesuChanHtmlChanHelper
 from chan.doushio_html import DoushioHtmlChanHelper
 from chan.endchan_html import EndchanHtmlChanHelper
 from chan.json import JsonChanHelper
@@ -213,5 +214,22 @@ CHANS = {
            "moe",
        ),
        rps=1
-    )
+    ),
    "desuchan": DesuChanHtmlChanHelper(
        17,
        "https://desuchan.net/",
        "https://desuchan.net/",
        "/res/",
        "/src/",
        (
            "bananas", "boku", "dawa", "desu", "jum", "kashira", "md",
            "otousama", "ro", "unyuu", "yakult", "a", "c", "h", "_loli",
            "moonspeak", "nagato", "nij", "nipa", "touhou", "tr", "yan",
            "yan", "vn", "do", "fi", "lit", "o", "pro", "tech", "v", "vic",
            "arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w",
            "sandbox", "sugg"
        ),
        rps=1/10
    ),
 }
--- a/chan/desuchan_html.py
+++ b/chan/desuchan_html.py
@@ -0,0 +1,80 @@
 import datetime
 from bs4 import BeautifulSoup
 from chan.helper import ChanHelper
 from post_process import get_links_from_html_body
 class DesuChanHtmlChanHelper(ChanHelper):
    """Chan helper that scrapes desuchan by parsing its HTML pages.

    Threads are discovered from the board index pages and posts are
    extracted from the per-thread ``.html`` page. Items are plain dicts
    with the keys ``id``, ``type``, ``html`` and ``time`` (thread-list
    entries only carry ``id``).
    """

    # strptime format of the trailing text node inside a post's <label>,
    # e.g. "\n19/09/08(Sun)15:50".
    _TIME_FORMAT = "\n%y/%m/%d(%a)%H:%M"

    def threads_url(self, board):
        """Return the URL of the first index page of *board*."""
        return "%s%s/" % (self._base_url, board)

    def posts_url(self, board, thread):
        """Return the URL of the HTML page for thread number *thread*."""
        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)

    @staticmethod
    def item_id(item):
        """Return the numeric id stored in *item*."""
        return item["id"]

    def item_urls(self, item, board):
        """Return the de-duplicated links found in the item's HTML.

        *board* is accepted for interface compatibility and is not used.
        """
        return list(set(get_links_from_html_body(item["html"], self._base_url)))

    @staticmethod
    def item_type(item):
        """Return the item's type: ``"post"`` or ``"thread"``."""
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        """Return the thread's modification marker (always ``-1`` for now)."""
        return -1  # TODO: Parse the 'X posts, Y images' span

    @staticmethod
    def item_mtime(item):
        """Return the item's posting time as a Unix timestamp."""
        return item["time"]

    @staticmethod
    def _is_thread_id(tid):
        """Tell whether an element id is one prefix character plus digits."""
        return bool(tid) and tid[1:].isdigit()

    @staticmethod
    def _label_timestamp(label):
        """Parse the date text that is the last child of *label*.

        Returns an integer Unix timestamp. Raises ``ValueError`` when the
        label is empty or its last child does not match ``_TIME_FORMAT``.
        """
        *_, stamp = label.children
        parsed = datetime.datetime.strptime(
            stamp, DesuChanHtmlChanHelper._TIME_FORMAT
        )
        # NOTE(review): the parsed datetime is naive, so timestamp()
        # interprets it in the local timezone of this process -- confirm
        # that this matches the timezone the site renders dates in.
        return int(parsed.timestamp())

    def parse_threads_list(self, r):
        """Extract the threads of one board index page.

        :param r: HTTP response whose ``text`` is the index page HTML.
        :return: ``(threads, next_url)`` where *threads* is a list of
            ``{"id": int}`` dicts and *next_url* is the URL of the next
            index page, or ``None`` when there is no usable "Next" form.
        """
        soup = BeautifulSoup(r.text, "html.parser")
        threads = [
            {"id": int(thread_el.get("id")[1:])}
            for thread_el in soup.find_all("div", id=self._is_thread_id)
        ]

        for form in soup.find_all("form"):
            action = form.get("action")
            # A missing/empty action or the literal "none" cannot be
            # turned into a URL, so such a form is not a pagination link.
            if not action or action == "none":
                continue
            if form.find("input", attrs={"value": "Next"}) is not None:
                return threads, self._base_url.rstrip("/") + action
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield every reply of a thread page, then the opening post.

        :param r: HTTP response whose ``text`` is the thread page HTML.
        :return: generator of item dicts (``id``, ``type``, ``html``,
            ``time``). Replies come first with type ``"post"``; the
            opening post is yielded last with type ``"thread"``.

        Nothing is yielded when the page contains no thread container.
        Reply tables lacking a label or a ``td.reply`` cell are logged
        and skipped.
        """
        import logging  # local import keeps this block self-contained

        soup = BeautifulSoup(r.text, "html.parser")
        op_el = soup.find("div", id=DesuChanHtmlChanHelper._is_thread_id)
        if op_el is None:
            return

        # Replies are the <table> elements directly under the thread div.
        for post_el in op_el.find_all("table", recursive=False):
            label = post_el.find("label")
            reply_cell = post_el.find("td", attrs={"class": "reply"})
            if label is None or reply_cell is None:
                logging.getLogger(__name__).warning(
                    "Skipping unparseable reply element: %s", post_el
                )
                continue
            yield {
                # The cell id is a five-character prefix plus the number.
                "id": int(reply_cell.get("id")[5:]),
                "type": "post",
                "html": str(post_el),
                "time": DesuChanHtmlChanHelper._label_timestamp(label),
            }
            # Remove the reply from the tree so that the opening post's
            # HTML, serialised below, does not include it.
            post_el.decompose()

        yield {
            "id": int(op_el.get("id")[1:]),
            "type": "thread",
            "html": str(op_el),
            "time": DesuChanHtmlChanHelper._label_timestamp(op_el.find("label")),
        }