add 0chan, implement mtime for html boards

simon 2019-09-08 22:09:20 -04:00
parent b6c42c1db3
commit 06d6762d51
9 changed files with 130 additions and 25 deletions
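The heart of this commit is the set of thread_mtime changes below: instead of returning -1 ("unknown"), the HTML-board helpers now report the omitted-post count scraped from the board index, so a crawler can tell from the index alone whether a thread has new posts. A minimal sketch of how such a marker might be consumed (threads_to_refresh and the last_mtime state dict are hypothetical, not part of this repo):

    def threads_to_refresh(helper, threads, last_mtime):
        """Pick thread ids whose change marker moved since the last crawl.

        threads is what helper.parse_threads_list() returned; last_mtime
        (hypothetical crawler state) maps thread id -> last seen mtime.
        """
        stale = []
        for thread in threads:
            mtime = helper.thread_mtime(thread)  # now the omitted-post count for HTML boards
            if mtime != last_mtime.get(thread["id"], -1):
                stale.append(thread["id"])
                last_mtime[thread["id"]] = mtime
        return stale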

View File

@@ -9,6 +9,7 @@ from chan.mayuri import MayuriChanHelper
from chan.russian_json import RussianJsonChanHelper
from chan.synch_json import SynchJsonChanHelper
from chan.tgchan_html import TgChanHtmlChanHelper
from chan.zerochan_html import ZerochanHtmlChanHelper

CHANS = {
    "4chan": JsonChanHelper(
@@ -75,7 +76,7 @@ CHANS = {
        (
            "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
        ),
        rps=1 / 30
        rps=1 / 60
    ),
    # TODO
    # "1chan": ChanHelper(
@@ -107,7 +108,7 @@ CHANS = {
            "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
            "vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
        ),
        rps=1/5
        rps=1 / 5
    ),
    "endchan": EndchanHtmlChanHelper(
        8,
@@ -128,7 +129,7 @@ CHANS = {
            "ausneets", "qanonresearch", "polru", "yuri", "christianity",
            "kc", "rapport", "news", "brit", "webm", "4chon"
        ),
        rps=1/2
        rps=1 / 10
    ),
    "38chan": JsonChanHelper(
        9,
@@ -151,7 +152,7 @@ CHANS = {
            "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
            "fit", "had",
        ),
        rps=1 / 30
        rps=1 / 60
    ),
    "gnfos": JsonChanHelper(
        11,
@@ -216,7 +217,7 @@ CHANS = {
        (
            "moe",
        ),
        rps=1/20
        rps=1 / 20
    ),
    "desuchan": DesuChanHtmlChanHelper(
        17,
@@ -232,7 +233,7 @@ CHANS = {
            "arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w",
            "sandbox", "sugg"
        ),
        rps=1/10
        rps=1 / 30
    ),
    "aurorachan": DesuChanHtmlChanHelper(
        18,
@@ -244,7 +245,7 @@ CHANS = {
            "_bm", "de", "ic", "rp", "rpi", "v", "w", "tg",
            "alt", "b", "g", "pkmn", "yuri", "fl", "mu", "sugg"
        ),
        rps=1/20
        rps=1 / 20
    ),
    "tgchan": TgChanHtmlChanHelper(
        19,
@@ -255,7 +256,7 @@ CHANS = {
        (
            "draw", "meep", "quest", "questdis", "tg", "icons",
        ),
        rps=1/600,
        rps=1 / 600,
    ),
    "lolnada": LolNadaHtmlChanHelper(
        20,
@@ -267,7 +268,7 @@ CHANS = {
            "b", "a", "aw", "cgl", "dw", "int", "qt", "sad", "t",
            "toy", "v", "x", "34", "e", "f", "h"
        ),
        rps=1/20,
        rps=1 / 60,
    ),
    "fchan": FChanHtmlChanHelper(
        21,
@@ -278,6 +279,22 @@ CHANS = {
        (
            "f", "m", "h", "s", "toon", "a", "ah", "c", "artist", "crit", "b"
        ),
        rps=1/60,
        rps=1 / 60,
    ),
    "0chan": ZerochanHtmlChanHelper(
        22,
        "https://0-chan.ru/",
        "https://0-chan.ru/",
        "",
        "/assets/",
        (
            "0", "0ch", "0chan", "1chan", "2ch", "3dprintor", "8", "\\_b", "a",
            "an", "asylum", "bb", "bo", "c", "copypaste", "dog", "draw", "e",
            "elite", "eot", "ergrgergre", "fido", "fur", "g", "game", "hui", "huz",
            "hw", "ithub", "m", "meta", "naotoudigu", "nhc", "nullchan", "parasha",
            "poligon", "postach", "psih", "r", "rm", "s", "shrek", "shy", "t",
            "test", "tlp", "tmp", "tv", "vg", "vipe", "wh", "xikkadvach", "ynet"
        ),
        rps=1 / 5
    )
}
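Several of the rps caps above are lowered in this commit (endchan 1/2 -> 1/10, lolnada 1/20 -> 1/60). Assuming rps is a per-site budget in requests per second, a fetch wrapper would honor it roughly like this (a sketch; polite_get is hypothetical and not part of this diff):

    import time

    def polite_get(session, helper, url):
        # Space requests at least 1/rps seconds apart, e.g. rps=1 / 60 -> one request per minute.
        time.sleep(1 / helper.rps)
        return session.get(url)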

View File

@@ -27,20 +27,22 @@ class DesuChanHtmlChanHelper(ChanHelper):
    @staticmethod
    def thread_mtime(thread):
        return -1  # TODO: Parse the 'X posts, Y images' span
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
        return item["time"]

    def parse_threads_list(self, r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        threads = []
        for threadEl in soup.find_all("div", id=lambda tid: tid and tid[1:].isdigit()):
            omit = threadEl.find("span", class_="omittedposts")
            threads.append({
                "id": int(threadEl.get("id")[1:]),
                "omit": int(omit.text.split(" ")[0]) if omit else 0
            })
        for form in soup.find_all("form"):
for form in soup.find_all("form"):
@@ -51,7 +53,7 @@ class DesuChanHtmlChanHelper(ChanHelper):
    @staticmethod
    def parse_thread(r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
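The r.text -> r.content.decode('utf-8', 'ignore') change repeated across these helpers sidesteps requests' charset handling: r.text decodes with whatever charset the server advertises (falling back to ISO-8859-1 for text/* responses), which garbles these mostly-UTF-8 boards, while the explicit lenient decode simply drops undecodable bytes. Roughly (placeholder URL, for illustration only):

    import requests

    r = requests.get("https://example.net/board/")  # placeholder URL
    maybe_mojibake = r.text                         # decoded per the advertised/guessed charset
    html = r.content.decode("utf-8", "ignore")      # force UTF-8, silently dropping bad bytes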

View File

@@ -28,20 +28,22 @@ class DoushioHtmlChanHelper(ChanHelper):
    @staticmethod
    def thread_mtime(thread):
        return -1
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
        return item["time"]

    def parse_threads_list(self, r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        threads = []
        for threadEl in soup.find_all("section"):
            omit = threadEl.find("span", class_="omit")
            threads.append({
                "id": int(threadEl.get("id")),
                "omit": int(omit.text.split(" ")[0]) if omit else 0
            })
        next_url = soup.find("link", attrs={"rel": "next"})
next_url = soup.find("link", attrs={"rel": "next"})
@@ -51,7 +53,7 @@ class DoushioHtmlChanHelper(ChanHelper):
    @staticmethod
    def parse_thread(r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        op_el = soup.find("section")
        for post_el in op_el.find_all("article"):

View File

@@ -28,20 +28,22 @@ class EndchanHtmlChanHelper(ChanHelper):
    @staticmethod
    def thread_mtime(thread):
        return -1
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
        return item["time"]

    def parse_threads_list(self, r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        threads = []
        for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
            omit = threadEl.find("div", class_="labelOmission")
            threads.append({
                "id": int(threadEl.get("id")),
                "omit": int(omit.text.split(" ")[0]) if omit else 0
            })
        next_url = soup.find("a", attrs={"id": "linkNext"})
next_url = soup.find("a", attrs={"id": "linkNext"})
@@ -51,7 +53,7 @@ class EndchanHtmlChanHelper(ChanHelper):
    @staticmethod
    def parse_thread(r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        op_el = soup.find("div", attrs={"class": "innerOP"})
        if not op_el:

View File

@@ -1,5 +1,5 @@
import datetime
import json
import _strptime
import re
from urllib.parse import urljoin
@@ -31,7 +31,7 @@ class FChanHtmlChanHelper(DesuChanHtmlChanHelper):
    @staticmethod
    def parse_thread(r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        op_el = soup.find("div", id=lambda tid: tid and re.match("thread[0-9]+", tid))
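The seemingly unrelated import json -> import _strptime swap at the top of this file is most likely the standard workaround for CPython's lazy-import race: the first datetime.strptime call made from a worker thread can fail (historically with "ImportError: Failed to import _strptime because the import lock is held by another thread"), and importing _strptime once at module load avoids it. A self-contained demonstration of the pattern, not taken from this repo:

    import _strptime  # noqa: F401 -- pre-import the module strptime loads lazily
    import datetime
    from concurrent.futures import ThreadPoolExecutor

    def parse(ts):
        return datetime.datetime.strptime(ts, "%d %b %Y %H:%M")

    with ThreadPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(parse, ["08 Sep 2019 22:09"] * 4)))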

View File

@@ -12,7 +12,7 @@ class ChanHelper:
        self.rps = rps

    def boards(self):
        return [b for b in self._boards if not b.startswith("_")]
        return [b.replace("\\_", "_") for b in self._boards if not b.startswith("_")]

    def image_url(self, board, tim, extension):
        return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)

View File

@@ -32,7 +32,7 @@ class LolNadaHtmlChanHelper(ChanHelper):
    @staticmethod
    def thread_mtime(thread):
        return -1
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
@@ -44,9 +44,11 @@ class LolNadaHtmlChanHelper(ChanHelper):
        threads = []
        for threadEl in soup.find_all("div", class_="hilo"):
            omit = threadEl.find("span", class_="omitted")
            threads.append({
                "id": int(threadEl.get("data-id")),
                "url": threadEl.find("a", class_="post_no").get("href"),
                "omit": int(omit.get("data-omitidos")) if omit else 0
            })
        for form in soup.find_all("form"):

View File

@@ -10,7 +10,7 @@ from chan.desuchan_html import DesuChanHtmlChanHelper
class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):

    def parse_threads_list(self, r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        threads = []
@@ -27,7 +27,7 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
    @staticmethod
    def parse_thread(r):
        soup = BeautifulSoup(r.text, "html.parser")
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        op_el = soup.find("form", id="delform")

chan/zerochan_html.py Normal file
View File

@@ -0,0 +1,80 @@
import datetime
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from chan.doushio_html import DoushioHtmlChanHelper
from post_process import get_links_from_html_body


def _ru_datefmt(text):
    # For some reason, the dates are not compatible with ru_RU.UTF-8...
    return re.sub(r"\(.{3}\)", "", text) \
        .replace("Янв", "Jan") \
        .replace("Фев", "Feb") \
        .replace("Мар", "Mar") \
        .replace("Апр", "Apr") \
        .replace("Май", "May") \
        .replace("Июн", "Jun") \
        .replace("Июл", "Jul") \
        .replace("Авг", "Aug") \
        .replace("Сеп", "Sep") \
        .replace("Окт", "Oct") \
        .replace("Ноя", "Nov") \
        .replace("Дек", "Dec")


class ZerochanHtmlChanHelper(DoushioHtmlChanHelper):

    @staticmethod
    def thread_mtime(thread):
        return thread["omit"]

    def item_urls(self, item, board):
        return [
            x for
            x in set(get_links_from_html_body(item["html"], self._base_url))
            if "google.com" not in x and "whatanime.ga" not in x and "iqdb.org" not in x and "saucenao.com" not in x
        ]

    def parse_threads_list(self, r):
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        threads = []
        for threadEl in soup.find_all("section", attrs={"data-id": lambda x: x}):
            omit = threadEl.find("span", class_="omit")
            threads.append({
                "id": int(threadEl.get("data-id")),
                "omit": int(omit.get("data-omit")) if omit else 0
            })
        for a in soup.find_all("a"):
            if a.text == ">":
                return threads, urljoin(r.url, a.get("href"))
        return threads, None

    @staticmethod
    def parse_thread(r):
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        op_el = soup.find("section", attrs={"data-id": lambda x: x})
        for post_el in op_el.find_all("article", attrs={"data-id": lambda x: x}):
            yield {
                "id": int(post_el.get("data-id")),
                "type": "post",
                "html": str(post_el),
                "time": int(datetime.datetime.strptime(_ru_datefmt(post_el.find("time").text),
                                                       "%d %b %Y %H:%M").timestamp())
            }
            post_el.decompose()
        yield {
            "id": int(op_el.get("data-id")[1:]),
            "type": "thread",
            "html": str(op_el),
            "time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("time").text),
                                                   "%d %b %Y %H:%M").timestamp())
        }
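For reference, how the new date path fits together, fed a made-up timestamp in the format the month table above implies (note the site apparently abbreviates September as "Сеп", hence that mapping):

    import datetime

    raw = "08 Сеп 2019 (Вск) 22:09"  # hypothetical 0chan timestamp
    cleaned = _ru_datefmt(raw)       # -> '08 Sep 2019  22:09' (weekday stripped, month mapped)
    ts = int(datetime.datetime.strptime(cleaned, "%d %b %Y %H:%M").timestamp())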