refactor chan.py, update endchan, add doushio

This commit is contained in:
simon 2019-09-08 14:42:15 -04:00
parent 175d0bebaa
commit 18247d4139
15 changed files with 679 additions and 530 deletions

View File

@@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan.
+horochan, doushio.

 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.

0
__init__.py Normal file
View File

527
chan.py
View File

@@ -1,527 +0,0 @@
import json
from json import JSONDecodeError
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from post_process import get_links_from_body
from util import logger
class ChanHelper:
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
self.db_id = db_id
self._base_url = base_url
self._image_url = image_url
self._thread_path = thread_path
self._image_path = image_path
self._boards = boards
self.rps = rps
def boards(self):
return [b for b in self._boards if not b.startswith("_")]
def image_url(self, board, tim, extension):
return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
def threads_url(self, board):
return "%s%s/threads.json" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
def board_hash(self, board):
return str((self._boards.index(board) + 1) * 10000)
@staticmethod
def item_id(item):
raise NotImplementedError
@staticmethod
def item_mtime(item):
raise NotImplementedError
def item_unique_id(self, item, board):
return int(self.board_hash(board) + str(self.item_id(item)))
@staticmethod
def thread_mtime(thread):
raise NotImplementedError
def item_urls(self, item, board):
raise NotImplementedError
@staticmethod
def item_type(item):
raise NotImplementedError
@staticmethod
def parse_threads_list(r):
raise NotImplementedError
@staticmethod
def parse_thread(r):
raise NotImplementedError
class HtmlChanHelper(ChanHelper):
def threads_url(self, board):
return "%s%s/" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
@staticmethod
def item_id(item):
return item["id"]
def item_urls(self, item, board):
return []
@staticmethod
def item_type(item):
return item["type"]
@staticmethod
def thread_mtime(thread):
return -1
@staticmethod
def item_mtime(item):
return 0 # TODO
def parse_threads_list(self, r):
soup = BeautifulSoup(r.text, "html.parser")
threads = []
for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
threads.append({
"id": int(threadEl.get("id")),
})
next_url = soup.find("a", attrs={"id": "linkNext"})
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None
@staticmethod
def parse_thread(r):
soup = BeautifulSoup(r.text, "html.parser")
op_el = soup.find("div", attrs={"class": "innerOP"})
yield {
"id": int(soup.find("div", class_="opCell").get("id")),
"type": "thread",
"html": str(op_el),
}
for post_el in soup.find_all("div", class_="postCell"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
}
class JsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return item["no"]
@staticmethod
def item_mtime(item):
return item["time"]
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self.image_url(board, item["tim"], item["ext"]))
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "sub" in item else "post"
@staticmethod
def thread_mtime(thread):
return thread["last_modified"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
if len(j) == 0 or "threads" not in j[0]:
logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
return [], None
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
threads = []
for page in j:
for thread in page["threads"]:
threads.append(thread)
return threads, None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
return j["posts"]
class RussianJsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return int(item["num"])
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
return j["threads"], None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
for thread in j["threads"]:
for post in thread["posts"]:
yield post
@staticmethod
def thread_mtime(thread):
return thread["posts_count"]
@staticmethod
def item_type(item):
return "thread" if "subject" in item and item["subject"] != "" else "post"
def item_urls(self, item, board):
urls = set()
if "comment" in item and item["comment"]:
urls.update(get_links_from_body(item["comment"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
for file in item["files"]:
urls.add(self._base_url.rstrip("/") + file["path"])
return list(urls)
class AlokalJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
return list(urls)
class SynchJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
return list(urls)
class MayuriChanHelper(ChanHelper):
def __init__(self, db_id, base_url, image_url, boards, rps):
super().__init__(db_id, base_url, image_url, None, None, boards, rps)
@staticmethod
def item_id(item):
return item["id"]
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def thread_mtime(thread):
return thread["replies_count"]
def item_urls(self, item, board):
urls = set()
if "message" in item and item["message"]:
urls.update(get_links_from_body(item["message"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
if item["files"]:
for file in item["files"]:
urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "replies_count" in item else "post"
def parse_threads_list(self, r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
if j["currentPage"] < j["totalPages"]:
return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1, )
return j["data"], None
@staticmethod
def parse_thread(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return []
thread = dict(j["data"])
del thread["replies"]
yield thread
if j["data"]["replies"]:
for post in j["data"]["replies"]:
yield post
def threads_url(self, board):
return "%sboards/1" % (self._base_url, )
def posts_url(self, board, thread):
return "%sthreads/%d" % (self._base_url, thread)
CHANS = {
"4chan": JsonChanHelper(
1,
"https://a.4cdn.org/",
"https://i.4cdn.org/",
"/thread/",
"/",
(
"a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
"k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
"vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
"cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
"bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
"gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
"news", "out", "po", "pol", "qst", "sci", "soc", "sp",
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
),
rps=2
),
"lainchan": JsonChanHelper(
2,
"https://lainchan.org/",
"https://lainchan.org/",
"/res/",
"/src/",
(
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
"hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
"_mega",
),
rps=1 / 60
),
"uboachan": JsonChanHelper(
3,
"https://uboachan.net/",
"https://uboachan.net/",
"/res/",
"/src/",
(
"yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
),
rps=1 / 120
),
"22chan": JsonChanHelper(
4,
"https://22chan.org/",
"https://22chan.org/",
"/res/",
"/src/",
(
"a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
"sg", "t", "vg"
),
rps=1 / 120
),
"wizchan": JsonChanHelper(
5,
"https://wizchan.org/",
"https://wizchan.org/",
"/res/",
"/src/",
(
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
),
rps=1 / 30
),
# TODO
"1chan": ChanHelper(
6,
"https://www.1chan.net/",
"https://www.1chan.net/",
"/res/",
"/src/",
(
"rails"
),
rps=1 / 600
),
"2chhk": RussianJsonChanHelper(
7,
"https://2ch.hk/",
"https://2ch.hk/",
"/res/",
"/src/",
(
"d", "b", "o", "soc", "media", "r", "api", "rf", "int",
"po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
"fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
"mo", "mov", "mu", "ne", "psy", "re",
"sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
"wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
"wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
"web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
),
rps=1
),
"endchan": HtmlChanHelper(
8,
"https://endchan.net/",
"https://endchan.net/",
"/res/",
"/.media/",
(
"art", "film", "oekaki", "draw",
"adv", "r9k", "hope", "spoon",
"a", "am", "amr", "l", "monster", "m", "2hu", "animach",
"b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu",
"metal", "tv", "f", "clipuploads",
"4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int",
"intl", "lang", "librejp", "rzabczan", "55chan",
"pol", "pdfs", "his", "ggrevols", "horror", "aethism",
"tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600",
"ausneets", "qanonresearch", "polru", "yuri", "christianity",
"kc", "rapport", "news", "brit", "webm", "4chon"
),
rps=1
),
"38chan": JsonChanHelper(
9,
"http://38chan.net/",
"http://38chan.net/",
"/res/",
"/src/",
(
"a", "b", "g", "38"
),
rps=1 / 600
),
"alokal": AlokalJsonChanHelper(
10,
"https://alokal.eu/",
"https://alokal.eu/",
"/",
"src/",
(
"b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
"fit", "had",
),
rps=1 / 4
),
"gnfos": JsonChanHelper(
11,
"https://gnfos.com/",
"https://gnfos.com/",
"/res/",
"/src/",
(
"jp", "drive"
),
rps=1 / 60
),
"synch": SynchJsonChanHelper(
12,
"https://syn-ch.ru/",
"https://cdn.syn-ch.ru/",
"/res/",
"src",
(
"b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
"_wh", "old", "test"
),
rps=1 / 120
),
"tahta": JsonChanHelper(
13,
"https://tahta.ch/",
"https://tahta.ch/",
"/res/",
"/src/",
(
"b", "g", "s", "v"
),
rps=1 / 300
),
"awsumchan": JsonChanHelper(
14,
"https://awsumchan.org/",
"https://awsumchan.org/",
"/res/",
"/src/",
(
"an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
),
rps=1 / 600
),
"horochan": MayuriChanHelper(
15,
"https://api.horochan.ru/v1/",
"https://%s.horochan.ru/src/",
(
"b"
),
rps=1/20
),
}

0
chan/__init__.py Normal file
View File

17
chan/alokal_json.py Normal file
View File

@@ -0,0 +1,17 @@
from chan.json import JsonChanHelper
from post_process import get_links_from_body
class AlokalJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
return list(urls)

217
chan/chan.py Normal file
View File

@@ -0,0 +1,217 @@
from chan.alokal_json import AlokalJsonChanHelper
from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper
from chan.json import JsonChanHelper
from chan.mayuri import MayuriChanHelper
from chan.russian_json import RussianJsonChanHelper
from chan.synch_json import SynchJsonChanHelper
CHANS = {
"4chan": JsonChanHelper(
1,
"https://a.4cdn.org/",
"https://i.4cdn.org/",
"/thread/",
"/",
(
"a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
"k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
"vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
"cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
"bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
"gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
"news", "out", "po", "pol", "qst", "sci", "soc", "sp",
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
),
rps=2
),
"lainchan": JsonChanHelper(
2,
"https://lainchan.org/",
"https://lainchan.org/",
"/res/",
"/src/",
(
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
"hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
"_mega",
),
rps=1 / 60
),
"uboachan": JsonChanHelper(
3,
"https://uboachan.net/",
"https://uboachan.net/",
"/res/",
"/src/",
(
"yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
),
rps=1 / 120
),
"22chan": JsonChanHelper(
4,
"https://22chan.org/",
"https://22chan.org/",
"/res/",
"/src/",
(
"a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
"sg", "t", "vg"
),
rps=1 / 120
),
"wizchan": JsonChanHelper(
5,
"https://wizchan.org/",
"https://wizchan.org/",
"/res/",
"/src/",
(
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
),
rps=1 / 30
),
# TODO
# "1chan": ChanHelper(
# 6,
# "https://www.1chan.net/",
# "https://www.1chan.net/",
# "/res/",
# "/src/",
# (
# "rails"
# ),
# rps=1 / 600
# ),
"2chhk": RussianJsonChanHelper(
7,
"https://2ch.hk/",
"https://2ch.hk/",
"/res/",
"/src/",
(
"d", "b", "o", "soc", "media", "r", "api", "rf", "int",
"po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
"fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
"mo", "mov", "mu", "ne", "psy", "re",
"sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
"wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
"wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
"web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
),
rps=1
),
"endchan": EndchanHtmlChanHelper(
8,
"https://endchan.net/",
"https://endchan.net/",
"/res/",
"/.media/",
(
"art", "film", "oekaki", "draw",
"adv", "r9k", "hope", "spoon",
"a", "am", "amr", "l", "monster", "m", "2hu", "animach",
"b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu",
"metal", "tv", "f", "clipuploads",
"4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int",
"intl", "lang", "librejp", "rzabczan", "55chan",
"pol", "pdfs", "his", "ggrevols", "horror", "aethism",
"tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600",
"ausneets", "qanonresearch", "polru", "yuri", "christianity",
"kc", "rapport", "news", "brit", "webm", "4chon"
),
rps=1
),
"38chan": JsonChanHelper(
9,
"http://38chan.net/",
"http://38chan.net/",
"/res/",
"/src/",
(
"a", "b", "g", "38"
),
rps=1 / 600
),
"alokal": AlokalJsonChanHelper(
10,
"https://alokal.eu/",
"https://alokal.eu/",
"/",
"src/",
(
"b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
"fit", "had",
),
rps=1 / 4
),
"gnfos": JsonChanHelper(
11,
"https://gnfos.com/",
"https://gnfos.com/",
"/res/",
"/src/",
(
"jp", "drive"
),
rps=1 / 60
),
"synch": SynchJsonChanHelper(
12,
"https://syn-ch.ru/",
"https://cdn.syn-ch.ru/",
"/res/",
"src",
(
"b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
"_wh", "old", "test"
),
rps=1 / 120
),
"tahta": JsonChanHelper(
13,
"https://tahta.ch/",
"https://tahta.ch/",
"/res/",
"/src/",
(
"b", "g", "s", "v"
),
rps=1 / 300
),
"awsumchan": JsonChanHelper(
14,
"https://awsumchan.org/",
"https://awsumchan.org/",
"/res/",
"/src/",
(
"an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
),
rps=1 / 600
),
"horochan": MayuriChanHelper(
15,
"https://api.horochan.ru/v1/",
"https://%s.horochan.ru/src/",
(
"b",
),
rps=1 / 20
),
"doushio": DoushioHtmlChanHelper(
16,
"http://doushio.com/",
"http://doushio.com/",
"",
"/ass/",
(
"moe",
),
rps=1
)
}
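Each entry wires a helper subclass to one site's URL scheme and rate limit. A minimal sketch of how the registry is consumed (hypothetical snippet, assuming the chan package is importable):

from chan.chan import CHANS

helper = CHANS["4chan"]
print(helper.threads_url("g"))       # https://a.4cdn.org/g/threads.json
print(helper.posts_url("g", 12345))  # https://a.4cdn.org/g/thread/12345.json
print(helper.boards())               # boards prefixed with "_" are excluded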

75
chan/doushio_html.py Normal file
View File

@@ -0,0 +1,75 @@
from urllib.parse import urljoin
from dateutil import parser
from bs4 import BeautifulSoup
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
class DoushioHtmlChanHelper(ChanHelper):
def threads_url(self, board):
return "%s%s/" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s/%d" % (self._base_url, board, thread)
@staticmethod
def item_id(item):
return item["id"]
def item_urls(self, item, board):
return list(set(get_links_from_html_body(item["html"], self._base_url)))
@staticmethod
def item_type(item):
return item["type"]
@staticmethod
def thread_mtime(thread):
return -1
@staticmethod
def item_mtime(item):
return item["time"]
def parse_threads_list(self, r):
soup = BeautifulSoup(r.text, "html.parser")
threads = []
for threadEl in soup.find_all("section"):
threads.append({
"id": int(threadEl.get("id")),
})
next_url = soup.find("link", attrs={"rel": "next"})
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None
@staticmethod
def parse_thread(r):
soup = BeautifulSoup(r.text, "html.parser")
op_el = soup.find("section")
for post_el in op_el.find_all("article"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
"time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp())
}
post_el.decompose()
yield {
"id": int(op_el.get("id")),
"type": "thread",
"html": str(op_el),
"time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp())
}
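parse_thread yields the replies first and the stripped-down OP last, since each article is decomposed out of the section as it is consumed. A rough illustration (the HTML below is a made-up fragment shaped to match the selectors above, not captured doushio markup):

import types
from chan.doushio_html import DoushioHtmlChanHelper

html = """
<section id="1000">
  <header><time datetime="2019-09-08T14:00:00Z">14:00</time></header>
  OP text
  <article id="1001">
    <header><time datetime="2019-09-08T14:05:00Z">14:05</time></header>
    first reply
  </article>
</section>
"""
r = types.SimpleNamespace(text=html)  # stand-in for a requests.Response
for item in DoushioHtmlChanHelper.parse_thread(r):
    print(item["type"], item["id"])
# post 1001   (articles are yielded, then decomposed)
# thread 1000 (what remains of the <section> is the OP)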

74
chan/endchan_html.py Normal file
View File

@@ -0,0 +1,74 @@
import datetime
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
class EndchanHtmlChanHelper(ChanHelper):
def threads_url(self, board):
return "%s%s/" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
@staticmethod
def item_id(item):
return item["id"]
def item_urls(self, item, board):
return list(set(get_links_from_html_body(item["html"], self._base_url)))
@staticmethod
def item_type(item):
return item["type"]
@staticmethod
def thread_mtime(thread):
return -1
@staticmethod
def item_mtime(item):
return item["time"]
def parse_threads_list(self, r):
soup = BeautifulSoup(r.text, "html.parser")
threads = []
for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
threads.append({
"id": int(threadEl.get("id")),
})
next_url = soup.find("a", attrs={"id": "linkNext"})
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None
@staticmethod
def parse_thread(r):
soup = BeautifulSoup(r.text, "html.parser")
op_el = soup.find("div", attrs={"class": "innerOP"})
if not op_el:
return []
yield {
"id": int(soup.find("div", class_="opCell").get("id")),
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text,
"%m/%d/%Y (%a) %H:%M:%S").timestamp())
}
for post_el in soup.find_all("div", class_="postCell"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text,
"%m/%d/%Y (%a) %H:%M:%S").timestamp())
}
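Unlike doushio, endchan's timestamps come from the human-readable labelCreated span rather than a datetime attribute; a quick check of the format string (the label value is a made-up example):

import datetime

label = "09/08/2019 (Sun) 14:42:15"  # example labelCreated text
print(int(datetime.datetime.strptime(label, "%m/%d/%Y (%a) %H:%M:%S").timestamp()))

Note that strptime yields a naive datetime, so the resulting epoch value is interpreted in the scraper's local timezone.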

75
chan/helper.py Normal file
View File

@@ -0,0 +1,75 @@
class ChanHelper:
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
self.db_id = db_id
self._base_url = base_url
self._image_url = image_url
self._thread_path = thread_path
self._image_path = image_path
self._boards = boards
self.rps = rps
def boards(self):
return [b for b in self._boards if not b.startswith("_")]
def image_url(self, board, tim, extension):
return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
def threads_url(self, board):
return "%s%s/threads.json" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
def board_hash(self, board):
return str((self._boards.index(board) + 1) * 10000)
@staticmethod
def item_id(item):
raise NotImplementedError
@staticmethod
def item_mtime(item):
raise NotImplementedError
def item_unique_id(self, item, board):
return int(self.board_hash(board) + str(self.item_id(item)))
@staticmethod
def thread_mtime(thread):
raise NotImplementedError
def item_urls(self, item, board):
raise NotImplementedError
@staticmethod
def item_type(item):
raise NotImplementedError
@staticmethod
def parse_threads_list(r):
raise NotImplementedError
@staticmethod
def parse_thread(r):
raise NotImplementedError
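A concrete helper is consumed by the crawler only through this interface; a minimal sketch of that loop (hypothetical, assuming web.get returns a requests-style response):

def crawl_board(helper, board, web):
    # parse_threads_list returns (threads, next_page_url_or_None),
    # so this pages through the board index until it runs out.
    url = helper.threads_url(board)
    while url:
        threads, url = helper.parse_threads_list(web.get(url))
        for thread in threads:
            r = web.get(helper.posts_url(board, helper.item_id(thread)))
            for item in helper.parse_thread(r):
                # item_unique_id prefixes the item id with board_hash(),
                # e.g. the second board ("20000") and post 123 -> 20000123
                yield helper.item_unique_id(item, board), item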

60
chan/json.py Normal file
View File

@@ -0,0 +1,60 @@
import json
from json import JSONDecodeError
from chan.helper import ChanHelper
from post_process import get_links_from_body
from util import logger
class JsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return item["no"]
@staticmethod
def item_mtime(item):
return item["time"]
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self.image_url(board, item["tim"], item["ext"]))
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "sub" in item else "post"
@staticmethod
def thread_mtime(thread):
return thread["last_modified"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
if len(j) == 0 or "threads" not in j[0]:
logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
return [], None
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
threads = []
for page in j:
for thread in page["threads"]:
threads.append(thread)
return threads, None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
return j["posts"]
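The threads.json shape this expects is a list of pages, each carrying its own threads array, which parse_threads_list flattens. A trimmed, made-up example (assuming the package is on the path):

import json, types
from chan.json import JsonChanHelper

sample = json.dumps([
    {"page": 1, "threads": [{"no": 12345, "last_modified": 1567965735}]},
    {"page": 2, "threads": [{"no": 12360, "last_modified": 1567965600}]},
])
r = types.SimpleNamespace(text=sample, url="https://a.4cdn.org/g/threads.json")
threads, next_url = JsonChanHelper.parse_threads_list(r)
# threads holds both thread dicts; next_url is always None for this helper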

75
chan/mayuri.py Normal file
View File

@@ -0,0 +1,75 @@
import json
from json import JSONDecodeError
from chan.helper import ChanHelper
from post_process import get_links_from_body
from util import logger
class MayuriChanHelper(ChanHelper):
def __init__(self, db_id, base_url, image_url, boards, rps):
super().__init__(db_id, base_url, image_url, None, None, boards, rps)
@staticmethod
def item_id(item):
return item["id"]
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def thread_mtime(thread):
return thread["replies_count"]
def item_urls(self, item, board):
urls = set()
if "message" in item and item["message"]:
urls.update(get_links_from_body(item["message"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
if item["files"]:
for file in item["files"]:
urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "replies_count" in item else "post"
def parse_threads_list(self, r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
if j["currentPage"] < j["totalPages"]:
return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1,)
return j["data"], None
@staticmethod
def parse_thread(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return []
thread = dict(j["data"])
del thread["replies"]
yield thread
if j["data"]["replies"]:
for post in j["data"]["replies"]:
yield post
def threads_url(self, board):
return "%sboards/1" % (self._base_url,)
def posts_url(self, board, thread):
return "%sthreads/%d" % (self._base_url, thread)

55
chan/russian_json.py Normal file
View File

@@ -0,0 +1,55 @@
import json
from json import JSONDecodeError
from chan.helper import ChanHelper
from post_process import get_links_from_body
from util import logger
class RussianJsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return int(item["num"])
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
return j["threads"], None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
for thread in j["threads"]:
for post in thread["posts"]:
yield post
@staticmethod
def thread_mtime(thread):
return thread["posts_count"]
@staticmethod
def item_type(item):
return "thread" if "subject" in item and item["subject"] != "" else "post"
def item_urls(self, item, board):
urls = set()
if "comment" in item and item["comment"]:
urls.update(get_links_from_body(item["comment"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
for file in item["files"]:
urls.add(self._base_url.rstrip("/") + file["path"])
return list(urls)

18
chan/synch_json.py Normal file
View File

@@ -0,0 +1,18 @@
from chan.json import JsonChanHelper
from post_process import get_links_from_body
class SynchJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
return list(urls)

View File

@@ -3,6 +3,7 @@ import hashlib
 import re
 import zlib
 from io import BytesIO
+from urllib.parse import urljoin

 import imagehash
 from PIL import Image
@@ -10,6 +11,7 @@ from PIL import Image
 from util import logger

 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
+HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")

 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -71,7 +73,7 @@ def image_meta(url, url_idx, web):

 def post_process(item, board, helper, web):
-    item["_v"] = 1.4
+    item["_v"] = 1.5
     item["_id"] = helper.item_unique_id(item, board)
     item["_board"] = board
@@ -100,5 +102,13 @@ def get_links_from_body(body):
     return result


+def get_links_from_html_body(body, base_url):
+    result = []
+    for match in HTML_HREF_RE.finditer(body):
+        url = match.group(1)
+        result.append(urljoin(base_url, url))
+    return result
+
+
 def is_external(url):
     return not url.startswith(("#", "/"))

2
run.py
View File

@@ -10,7 +10,7 @@ from threading import Thread
 import pika

 import monitoring
-from chan import CHANS
+from chan.chan import CHANS
 from post_process import post_process
 from util import logger, Web