add desuchan

This commit is contained in:
simon 2019-09-08 15:50:05 -04:00
parent 18247d4139
commit e13770d603
3 changed files with 100 additions and 2 deletions

View File

@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
Compatible image boards: 4chan, lainchan, uboachan,
22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
horochan, doushio.
horochan, doushio, desuchan.
Can optionally push monitoring data to InfluxDB. Below is an
example of Grafana being used to display it.

View File

@ -1,4 +1,5 @@
from chan.alokal_json import AlokalJsonChanHelper
from chan.desuchan_html import DesuChanHtmlChanHelper
from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper
from chan.json import JsonChanHelper
@ -213,5 +214,22 @@ CHANS = {
"moe",
),
rps=1
)
),
"desuchan": DesuChanHtmlChanHelper(
17,
"https://desuchan.net/",
"https://desuchan.net/",
"/res/",
"/src/",
(
"bananas", "boku", "dawa", "desu", "jum", "kashira", "md",
"otousama", "ro", "unyuu", "yakult", "a", "c", "h", "_loli",
"moonspeak", "nagato", "nij", "nipa", "touhou", "tr", "yan",
"yan", "vn", "do", "fi", "lit", "o", "pro", "tech", "v", "vic",
"arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w",
"sandbox", "sugg"
),
rps=1/10
),
}

80
chan/desuchan_html.py Normal file
View File

@ -0,0 +1,80 @@
import datetime
from bs4 import BeautifulSoup
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
class DesuChanHtmlChanHelper(ChanHelper):
    """ChanHelper implementation for desuchan.net.

    desuchan exposes no JSON API, so board index and thread pages are
    scraped from HTML with BeautifulSoup. Thread containers are divs
    whose ``id`` is one letter followed by the numeric thread id
    (e.g. ``"t12345"``); reply cells are ``<td id="reply123">``.
    """

    # Timestamp format found in post <label> elements,
    # e.g. "\n19/09/08(Sun)15:50" (leading newline is part of the text node).
    DATE_FORMAT = "\n%y/%m/%d(%a)%H:%M"

    def threads_url(self, board):
        """Return the URL of a board's first index page."""
        return "%s%s/" % (self._base_url, board)

    def posts_url(self, board, thread):
        """Return the URL of a single thread page."""
        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)

    @staticmethod
    def item_id(item):
        """Numeric id of a parsed thread/post item."""
        return item["id"]

    def item_urls(self, item, board):
        """Return all distinct links found in the item's raw HTML."""
        return list(set(get_links_from_html_body(item["html"], self._base_url)))

    @staticmethod
    def item_type(item):
        """Either ``"thread"`` or ``"post"`` (set by :meth:`parse_thread`)."""
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        # Modification time is not extracted from the index page yet.
        return -1  # TODO: Parse the 'X posts, Y images' span

    @staticmethod
    def item_mtime(item):
        """Unix timestamp of the item."""
        return item["time"]

    def parse_threads_list(self, r):
        """Parse a board index page.

        Returns ``(threads, next_page_url)``; ``next_page_url`` is None
        on the last page. ``r`` is a requests-style response object.
        """
        soup = BeautifulSoup(r.text, "html.parser")

        threads = [
            {"id": int(thread_el.get("id")[1:])}
            for thread_el in soup.find_all("div", id=lambda tid: tid and tid[1:].isdigit())
        ]

        # The "Next" button lives in a <form> whose action is the next
        # page's path; on the last page the form's action is "none".
        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Next"})
            if next_button and form.get("action") != "none":
                return threads, self._base_url.rstrip("/") + form.get("action")
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield every post of a thread page: replies first, then the OP.

        Each yielded item is a dict with "id", "type" ("post" or
        "thread"), the raw "html" and a Unix "time".
        """
        soup = BeautifulSoup(r.text, "html.parser")

        # The first div with a "<letter><digits>" id is the OP container.
        op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
        if op_el is None:
            # No recognizable thread container on this page; nothing to yield.
            return

        for post_el in op_el.find_all("table", recursive=False):
            label = post_el.find("label")
            if not label:
                # Not a post table (e.g. decorative markup) — skip it
                # instead of crashing on label.children below.
                continue
            # The last text node of the label is the post timestamp.
            *_, time = label.children
            yield {
                # Reply <td> ids look like "reply123"; strip the 5-char prefix.
                "id": int(post_el.find("td", attrs={"class": "reply"}).get("id")[5:]),
                "type": "post",
                "html": str(post_el),
                "time": int(datetime.datetime.strptime(
                    time, DesuChanHtmlChanHelper.DATE_FORMAT
                ).timestamp())
            }
            # Remove the reply from the tree so the final str(op_el)
            # contains only the OP's own markup.
            post_el.decompose()

        *_, time = op_el.find("label").children
        yield {
            "id": int(op_el.get("id")[1:]),
            "type": "thread",
            "html": str(op_el),
            "time": int(datetime.datetime.strptime(
                time, DesuChanHtmlChanHelper.DATE_FORMAT
            ).timestamp())
        }