refactor chan.py, update endchan, add doushio

2025-12-10 20:08:49 +00:00 · 2019-09-08 14:42:15 -04:00
parent 175d0bebaa
commit 18247d4139
15 changed files with 679 additions and 530 deletions
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
 
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan.
+horochan, doushio.

 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.
--- a/init.py
+++ b/init.py
--- a/chan.py
+++ b/chan.py
@@ -1,527 +0,0 @@
-import json
-from json import JSONDecodeError
-from urllib.parse import urljoin
-
-from bs4 import BeautifulSoup
-
-from post_process import get_links_from_body
-from util import logger
-
-
-class ChanHelper:
-    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
-        self.db_id = db_id
-        self._base_url = base_url
-        self._image_url = image_url
-        self._thread_path = thread_path
-        self._image_path = image_path
-        self._boards = boards
-        self.rps = rps
-
-    def boards(self):
-        return [b for b in self._boards if not b.startswith("_")]
-
-    def image_url(self, board, tim, extension):
-        return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
-
-    def threads_url(self, board):
-        return "%s%s/threads.json" % (self._base_url, board)
-
-    def posts_url(self, board, thread):
-        return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
-
-    def board_hash(self, board):
-        return str((self._boards.index(board) + 1) * 10000)
-
-    @staticmethod
-    def item_id(item):
-        raise NotImplementedError
-
-    @staticmethod
-    def item_mtime(item):
-        raise NotImplementedError
-
-    def item_unique_id(self, item, board):
-        return int(self.board_hash(board) + str(self.item_id(item)))
-
-    @staticmethod
-    def thread_mtime(thread):
-        raise NotImplementedError
-
-    def item_urls(self, item, board):
-        raise NotImplementedError
-
-    @staticmethod
-    def item_type(item):
-        raise NotImplementedError
-
-    @staticmethod
-    def parse_threads_list(r):
-        raise NotImplementedError
-
-    @staticmethod
-    def parse_thread(r):
-        raise NotImplementedError
-
-
-class HtmlChanHelper(ChanHelper):
-
-    def threads_url(self, board):
-        return "%s%s/" % (self._base_url, board)
-
-    def posts_url(self, board, thread):
-        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
-
-    @staticmethod
-    def item_id(item):
-        return item["id"]
-
-    def item_urls(self, item, board):
-        return []
-
-    @staticmethod
-    def item_type(item):
-        return item["type"]
-
-    @staticmethod
-    def thread_mtime(thread):
-        return -1
-
-    @staticmethod
-    def item_mtime(item):
-        return 0  # TODO
-
-    def parse_threads_list(self, r):
-        soup = BeautifulSoup(r.text, "html.parser")
-
-        threads = []
-
-        for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
-            threads.append({
-                "id": int(threadEl.get("id")),
-            })
-
-        next_url = soup.find("a", attrs={"id": "linkNext"})
-        if next_url:
-            return threads, urljoin(r.url, next_url.get("href"))
-        return threads, None
-
-    @staticmethod
-    def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
-
-        op_el = soup.find("div", attrs={"class": "innerOP"})
-        yield {
-            "id": int(soup.find("div", class_="opCell").get("id")),
-            "type": "thread",
-            "html": str(op_el),
-        }
-
-        for post_el in soup.find_all("div", class_="postCell"):
-            yield {
-                "id": int(post_el.get("id")),
-                "type": "post",
-                "html": str(post_el),
-            }
-
-
-class JsonChanHelper(ChanHelper):
-
-    @staticmethod
-    def item_id(item):
-        return item["no"]
-
-    @staticmethod
-    def item_mtime(item):
-        return item["time"]
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "com" in item and item["com"]:
-            urls.update(get_links_from_body(item["com"]))
-        elif "sub" in item and item["sub"]:
-            urls.update(get_links_from_body(item["sub"]))
-        if "fsize" in item and item["fsize"]:
-            urls.add(self.image_url(board, item["tim"], item["ext"]))
-
-        return list(urls)
-
-    @staticmethod
-    def item_type(item):
-        return "thread" if "sub" in item else "post"
-
-    @staticmethod
-    def thread_mtime(thread):
-        return thread["last_modified"]
-
-    @staticmethod
-    def parse_threads_list(r):
-        try:
-            j = json.loads(r.text)
-            if len(j) == 0 or "threads" not in j[0]:
-                logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
-                return [], None
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return [], None
-
-        threads = []
-        for page in j:
-            for thread in page["threads"]:
-                threads.append(thread)
-        return threads, None
-
-    @staticmethod
-    def parse_thread(r):
-        j = json.loads(r.text)
-        return j["posts"]
-
-
-class RussianJsonChanHelper(ChanHelper):
-
-    @staticmethod
-    def item_id(item):
-        return int(item["num"])
-
-    @staticmethod
-    def item_mtime(item):
-        return item["timestamp"]
-
-    @staticmethod
-    def parse_threads_list(r):
-        try:
-            j = json.loads(r.text)
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return [], None
-        return j["threads"], None
-
-    @staticmethod
-    def parse_thread(r):
-        j = json.loads(r.text)
-        for thread in j["threads"]:
-            for post in thread["posts"]:
-                yield post
-
-    @staticmethod
-    def thread_mtime(thread):
-        return thread["posts_count"]
-
-    @staticmethod
-    def item_type(item):
-        return "thread" if "subject" in item and item["subject"] != "" else "post"
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "comment" in item and item["comment"]:
-            urls.update(get_links_from_body(item["comment"]))
-        elif "subject" in item and item["subject"]:
-            urls.update(get_links_from_body(item["subject"]))
-
-        for file in item["files"]:
-            urls.add(self._base_url.rstrip("/") + file["path"])
-
-        return list(urls)
-
-
-class AlokalJsonChanHelper(JsonChanHelper):
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "com" in item and item["com"]:
-            urls.update(get_links_from_body(item["com"]))
-        elif "sub" in item and item["sub"]:
-            urls.update(get_links_from_body(item["sub"]))
-        if "fsize" in item and item["fsize"]:
-            urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
-
-        return list(urls)
-
-
-class SynchJsonChanHelper(JsonChanHelper):
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "com" in item and item["com"]:
-            urls.update(get_links_from_body(item["com"]))
-        elif "sub" in item and item["sub"]:
-            urls.update(get_links_from_body(item["sub"]))
-        if "fsize" in item and item["fsize"]:
-            urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
-
-        return list(urls)
-
-
-class MayuriChanHelper(ChanHelper):
-
-    def __init__(self, db_id, base_url, image_url, boards, rps):
-        super().__init__(db_id, base_url, image_url, None, None, boards, rps)
-
-    @staticmethod
-    def item_id(item):
-        return item["id"]
-
-    @staticmethod
-    def item_mtime(item):
-        return item["timestamp"]
-
-    @staticmethod
-    def thread_mtime(thread):
-        return thread["replies_count"]
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "message" in item and item["message"]:
-            urls.update(get_links_from_body(item["message"]))
-        elif "subject" in item and item["subject"]:
-            urls.update(get_links_from_body(item["subject"]))
-        if item["files"]:
-            for file in item["files"]:
-                urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
-
-        return list(urls)
-
-    @staticmethod
-    def item_type(item):
-        return "thread" if "replies_count" in item else "post"
-
-    def parse_threads_list(self, r):
-        try:
-            j = json.loads(r.text)
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return [], None
-        if j["currentPage"] < j["totalPages"]:
-            return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1, )
-        return j["data"]
-
-    @staticmethod
-    def parse_thread(r):
-        try:
-            j = json.loads(r.text)
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return []
-
-        thread = dict(j["data"])
-        del thread["replies"]
-        yield thread
-
-        if j["data"]["replies"]:
-            for post in j["data"]["replies"]:
-                yield post
-
-    def threads_url(self, board):
-        return "%sboards/1" % (self._base_url, )
-
-    def posts_url(self, board, thread):
-        return "%sthreads/%d" % (self._base_url, thread)
-
-
-CHANS = {
-    "4chan": JsonChanHelper(
-        1,
-        "https://a.4cdn.org/",
-        "https://i.4cdn.org/",
-        "/thread/",
-        "/",
-        (
-            "a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
-            "k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
-            "vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
-            "cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
-            "bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
-            "gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
-            "news", "out", "po", "pol", "qst", "sci", "soc", "sp",
-            "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
-        ),
-        rps=2
-    ),
-    "lainchan": JsonChanHelper(
-        2,
-        "https://lainchan.org/",
-        "https://lainchan.org/",
-        "/res/",
-        "/src/",
-        (
-            "λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
-            "hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
-            "_mega",
-        ),
-        rps=1 / 60
-    ),
-    "uboachan": JsonChanHelper(
-        3,
-        "https://uboachan.net/",
-        "https://uboachan.net/",
-        "/res/",
-        "/src/",
-        (
-            "yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
-            "ig", "2", "ot", "hikki", "cc", "x", "sugg"
-        ),
-        rps=1 / 120
-    ),
-    "22chan": JsonChanHelper(
-        4,
-        "https://22chan.org/",
-        "https://22chan.org/",
-        "/res/",
-        "/src/",
-        (
-            "a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
-            "sg", "t", "vg"
-        ),
-        rps=1 / 120
-    ),
-    "wizchan": JsonChanHelper(
-        5,
-        "https://wizchan.org/",
-        "https://wizchan.org/",
-        "/res/",
-        "/src/",
-        (
-            "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
-        ),
-        rps=1 / 30
-    ),
-    # TODO
-    "1chan": ChanHelper(
-        6,
-        "https://www.1chan.net/",
-        "https://www.1chan.net/",
-        "/res/",
-        "/src/",
-        (
-            "rails"
-        ),
-        rps=1 / 600
-    ),
-    "2chhk": RussianJsonChanHelper(
-        7,
-        "https://2ch.hk/",
-        "https://2ch.hk/",
-        "/res/",
-        "/src/",
-        (
-            "d", "b", "o", "soc", "media", "r", "api", "rf", "int",
-            "po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
-            "fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
-            "mo", "mov", "mu", "ne", "psy", "re",
-            "sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
-            "wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
-            "wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
-            "web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
-            "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
-            "vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
-        ),
-        rps=1
-    ),
-    "endchan": HtmlChanHelper(
-        8,
-        "https://endchan.net/",
-        "https://endchan.net/",
-        "/res/",
-        "/.media/",
-        (
-            "art", "film", "oekaki", "draw",
-            "adv", "r9k", "hope", "spoon",
-            "a", "am", "amr", "l", "monster", "m", "2hu", "animach",
-            "b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu",
-            "metal", "tv", "f", "clipuploads",
-            "4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int",
-            "intl", "lang", "librejp", "rzabczan", "55chan",
-            "pol", "pdfs", "his", "ggrevols", "horror", "aethism",
-            "tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600",
-            "ausneets", "qanonresearch", "polru", "yuri", "christianity",
-            "kc", "rapport", "news", "brit", "webm", "4chon"
-        ),
-        rps=1
-    ),
-    "38chan": JsonChanHelper(
-        9,
-        "http://38chan.net/",
-        "http://38chan.net/",
-        "/res/",
-        "/src/",
-        (
-            "a", "b", "g", "38"
-        ),
-        rps=1 / 600
-    ),
-    "alokal": AlokalJsonChanHelper(
-        10,
-        "https://alokal.eu/",
-        "https://alokal.eu/",
-        "/",
-        "src/",
-        (
-            "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
-            "fit", "had",
-        ),
-        rps=1 / 4
-    ),
-    "gnfos": JsonChanHelper(
-        11,
-        "https://gnfos.com/",
-        "https://gnfos.com/",
-        "/res/",
-        "/src/",
-        (
-            "jp", "drive"
-        ),
-        rps=1 / 60
-    ),
-    "synch": SynchJsonChanHelper(
-        12,
-        "https://syn-ch.ru/",
-        "https://cdn.syn-ch.ru/",
-        "/res/",
-        "src",
-        (
-            "b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
-            "_wh", "old", "test"
-        ),
-        rps=1 / 120
-    ),
-    "tahta": JsonChanHelper(
-        13,
-        "https://tahta.ch/",
-        "https://tahta.ch/",
-        "/res/",
-        "/src/",
-        (
-            "b", "g", "s", "v"
-        ),
-        rps=1 / 300
-    ),
-    "awsumchan": JsonChanHelper(
-        14,
-        "https://awsumchan.org/",
-        "https://awsumchan.org/",
-        "/res/",
-        "/src/",
-        (
-            "an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
-        ),
-        rps=1 / 600
-    ),
-    "horochan": MayuriChanHelper(
-        15,
-        "https://api.horochan.ru/v1/",
-        "https://%s.horochan.ru/src/",
-        (
-            "b"
-        ),
-        rps=1/20
-    ),
-}
--- a/chan/init.py
+++ b/chan/init.py
--- a/chan/alokal_json.py
+++ b/chan/alokal_json.py
@@ -0,0 +1,17 @@
+from chan.json import JsonChanHelper
+from post_process import get_links_from_body
+
+
+class AlokalJsonChanHelper(JsonChanHelper):
+    """JsonChanHelper variant that builds attachment URLs without the board name."""
+
+    def item_urls(self, item, board):
+        """Return the de-duplicated list of URLs referenced by a post.
+
+        Links are extracted from the comment ("com") when it is non-empty,
+        otherwise from the subject ("sub"). When the post has a truthy "fsize",
+        the attachment URL is added as well.
+
+        :param item: post dict as returned by the JSON API
+        :param board: board name (not used when building the attachment URL)
+        :return: list of URL strings, in no particular order
+        """
+        urls = set()
+
+        if "com" in item and item["com"]:
+            urls.update(get_links_from_body(item["com"]))
+        elif "sub" in item and item["sub"]:
+            urls.update(get_links_from_body(item["sub"]))
+        if "fsize" in item and item["fsize"]:
+            # Layout is <image_url><image_path><tim>/<no><ext>. item["tim"] is
+            # concatenated as-is, so it has to be a str here (unlike "no").
+            urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
+
+        return list(urls)
--- a/chan/chan.py
+++ b/chan/chan.py
@@ -0,0 +1,217 @@
+from chan.alokal_json import AlokalJsonChanHelper
+from chan.doushio_html import DoushioHtmlChanHelper
+from chan.endchan_html import EndchanHtmlChanHelper
+from chan.json import JsonChanHelper
+from chan.mayuri import MayuriChanHelper
+from chan.russian_json import RussianJsonChanHelper
+from chan.synch_json import SynchJsonChanHelper
+
+CHANS = {
+    "4chan": JsonChanHelper(
+        1,
+        "https://a.4cdn.org/",
+        "https://i.4cdn.org/",
+        "/thread/",
+        "/",
+        (
+            "a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
+            "k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
+            "vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
+            "cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
+            "bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
+            "gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
+            "news", "out", "po", "pol", "qst", "sci", "soc", "sp",
+            "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
+        ),
+        rps=2
+    ),
+    "lainchan": JsonChanHelper(
+        2,
+        "https://lainchan.org/",
+        "https://lainchan.org/",
+        "/res/",
+        "/src/",
+        (
+            "λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
+            "hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
+            "_mega",
+        ),
+        rps=1 / 60
+    ),
+    "uboachan": JsonChanHelper(
+        3,
+        "https://uboachan.net/",
+        "https://uboachan.net/",
+        "/res/",
+        "/src/",
+        (
+            "yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
+            "ig", "2", "ot", "hikki", "cc", "x", "sugg"
+        ),
+        rps=1 / 120
+    ),
+    "22chan": JsonChanHelper(
+        4,
+        "https://22chan.org/",
+        "https://22chan.org/",
+        "/res/",
+        "/src/",
+        (
+            "a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
+            "sg", "t", "vg"
+        ),
+        rps=1 / 120
+    ),
+    "wizchan": JsonChanHelper(
+        5,
+        "https://wizchan.org/",
+        "https://wizchan.org/",
+        "/res/",
+        "/src/",
+        (
+            "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
+        ),
+        rps=1 / 30
+    ),
+    # TODO
+    # "1chan": ChanHelper(
+    #     6,
+    #     "https://www.1chan.net/",
+    #     "https://www.1chan.net/",
+    #     "/res/",
+    #     "/src/",
+    #     (
+    #         "rails"
+    #     ),
+    #     rps=1 / 600
+    # ),
+    "2chhk": RussianJsonChanHelper(
+        7,
+        "https://2ch.hk/",
+        "https://2ch.hk/",
+        "/res/",
+        "/src/",
+        (
+            "d", "b", "o", "soc", "media", "r", "api", "rf", "int",
+            "po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
+            "fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
+            "mo", "mov", "mu", "ne", "psy", "re",
+            "sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
+            "wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
+            "wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
+            "web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
+            "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
+            "vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
+        ),
+        rps=1
+    ),
+    "endchan": EndchanHtmlChanHelper(
+        8,
+        "https://endchan.net/",
+        "https://endchan.net/",
+        "/res/",
+        "/.media/",
+        (
+            "art", "film", "oekaki", "draw",
+            "adv", "r9k", "hope", "spoon",
+            "a", "am", "amr", "l", "monster", "m", "2hu", "animach",
+            "b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu",
+            "metal", "tv", "f", "clipuploads",
+            "4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int",
+            "intl", "lang", "librejp", "rzabczan", "55chan",
+            "pol", "pdfs", "his", "ggrevols", "horror", "aethism",
+            "tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600",
+            "ausneets", "qanonresearch", "polru", "yuri", "christianity",
+            "kc", "rapport", "news", "brit", "webm", "4chon"
+        ),
+        rps=1
+    ),
+    "38chan": JsonChanHelper(
+        9,
+        "http://38chan.net/",
+        "http://38chan.net/",
+        "/res/",
+        "/src/",
+        (
+            "a", "b", "g", "38"
+        ),
+        rps=1 / 600
+    ),
+    "alokal": AlokalJsonChanHelper(
+        10,
+        "https://alokal.eu/",
+        "https://alokal.eu/",
+        "/",
+        "src/",
+        (
+            "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
+            "fit", "had",
+        ),
+        rps=1 / 4
+    ),
+    "gnfos": JsonChanHelper(
+        11,
+        "https://gnfos.com/",
+        "https://gnfos.com/",
+        "/res/",
+        "/src/",
+        (
+            "jp", "drive"
+        ),
+        rps=1 / 60
+    ),
+    "synch": SynchJsonChanHelper(
+        12,
+        "https://syn-ch.ru/",
+        "https://cdn.syn-ch.ru/",
+        "/res/",
+        "src",
+        (
+            "b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
+            "_wh", "old", "test"
+        ),
+        rps=1 / 120
+    ),
+    "tahta": JsonChanHelper(
+        13,
+        "https://tahta.ch/",
+        "https://tahta.ch/",
+        "/res/",
+        "/src/",
+        (
+            "b", "g", "s", "v"
+        ),
+        rps=1 / 300
+    ),
+    "awsumchan": JsonChanHelper(
+        14,
+        "https://awsumchan.org/",
+        "https://awsumchan.org/",
+        "/res/",
+        "/src/",
+        (
+            "an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
+        ),
+        rps=1 / 600
+    ),
+    "horochan": MayuriChanHelper(
+        15,
+        "https://api.horochan.ru/v1/",
+        "https://%s.horochan.ru/src/",
+        (
+            "b",
+        ),
+        rps=1 / 20
+    ),
+    "doushio": DoushioHtmlChanHelper(
+        16,
+        "http://doushio.com/",
+        "http://doushio.com/",
+        "",
+        "/ass/",
+        (
+            "moe",
+        ),
+        rps=1
+    )
+}
--- a/chan/doushio_html.py
+++ b/chan/doushio_html.py
@@ -0,0 +1,75 @@
+from urllib.parse import urljoin
+from dateutil import parser
+
+from bs4 import BeautifulSoup
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_html_body
+
+
+class DoushioHtmlChanHelper(ChanHelper):
+
+    def threads_url(self, board):
+        return "%s%s/" % (self._base_url, board)
+
+    def posts_url(self, board, thread):
+        return "%s%s/%d" % (self._base_url, board, thread)
+
+    @staticmethod
+    def item_id(item):
+        return item["id"]
+
+    def item_urls(self, item, board):
+        return list(set(get_links_from_html_body(item["html"], self._base_url)))
+
+    @staticmethod
+    def item_type(item):
+        return item["type"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        return -1
+
+    @staticmethod
+    def item_mtime(item):
+        return item["time"]
+
+    def parse_threads_list(self, r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        threads = []
+
+        for threadEl in soup.find_all("section"):
+            threads.append({
+                "id": int(threadEl.get("id")),
+            })
+
+        next_url = soup.find("link", attrs={"rel": "next"})
+        if next_url:
+            return threads, urljoin(r.url, next_url.get("href"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        op_el = soup.find("section")
+        for post_el in op_el.find_all("article"):
+            yield {
+                "id": int(post_el.get("id")),
+                "type": "post",
+                "html": str(post_el),
+                "time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp())
+            }
+            post_el.decompose()
+        yield {
+            "id": int(op_el.get("id")),
+            "type": "thread",
+            "html": str(op_el),
+            "time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp())
+        }
+
+
+
+
+
--- a/chan/endchan_html.py
+++ b/chan/endchan_html.py
@@ -0,0 +1,74 @@
+import datetime
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_html_body
+
+
+class EndchanHtmlChanHelper(ChanHelper):
+    """Helper that scrapes endchan boards from their HTML pages.
+
+    Items produced by this helper are dicts with the keys "id", "type",
+    "html" and "time".
+    """
+
+    def threads_url(self, board):
+        """Return the URL of the board page that lists threads."""
+        return "%s%s/" % (self._base_url, board)
+
+    def posts_url(self, board, thread):
+        """Return the URL of a thread's .html page; `thread` is the integer id."""
+        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
+
+    @staticmethod
+    def item_id(item):
+        """Return the numeric id stored on the item by the parsers below."""
+        return item["id"]
+
+    def item_urls(self, item, board):
+        """Return the de-duplicated links found in the item's HTML fragment."""
+        return list(set(get_links_from_html_body(item["html"], self._base_url)))
+
+    @staticmethod
+    def item_type(item):
+        """Return "thread" or "post", as set by parse_thread()."""
+        return item["type"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        """Return -1: the thread list page carries no modification marker."""
+        return -1
+
+    @staticmethod
+    def item_mtime(item):
+        """Return the integer Unix timestamp extracted by parse_thread()."""
+        return item["time"]
+
+    def parse_threads_list(self, r):
+        """Parse a board page.
+
+        :param r: HTTP response exposing `.text` and `.url`
+        :return: (threads, next_url) where threads is a list of {"id": int}
+                 and next_url is the absolute URL of the next page or None
+        """
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        threads = []
+
+        # Each thread on the page is a div.opCell whose id attribute is numeric.
+        for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
+            threads.append({
+                "id": int(threadEl.get("id")),
+            })
+
+        # Pagination: follow the "linkNext" anchor when the page has one.
+        next_url = soup.find("a", attrs={"id": "linkNext"})
+        if next_url:
+            return threads, urljoin(r.url, next_url.get("href"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        """Yield the thread (OP) item followed by one item per reply.
+
+        :param r: HTTP response exposing `.text`
+        :return: generator of item dicts; empty when the page has no div.innerOP
+        """
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        op_el = soup.find("div", attrs={"class": "innerOP"})
+        if not op_el:
+            # This function is a generator (it contains yield), so this return
+            # simply ends the iteration without producing any item.
+            return []
+        # NOTE(review): strptime produces a naive datetime, so .timestamp()
+        # interprets it in the local timezone, and %a depends on the locale -
+        # confirm this matches the timestamps the site renders.
+        yield {
+            "id": int(soup.find("div", class_="opCell").get("id")),
+            "type": "thread",
+            "html": str(op_el),
+            "time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text,
+                                                   "%m/%d/%Y (%a) %H:%M:%S").timestamp())
+        }
+
+        for post_el in soup.find_all("div", class_="postCell"):
+            yield {
+                "id": int(post_el.get("id")),
+                "type": "post",
+                "html": str(post_el),
+                "time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text,
+                                                       "%m/%d/%Y (%a) %H:%M:%S").timestamp())
+            }
--- a/chan/helper.py
+++ b/chan/helper.py
@@ -0,0 +1,75 @@
+from bs4 import BeautifulSoup
+
+
+class ChanHelper:
+    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
+        self.db_id = db_id
+        self._base_url = base_url
+        self._image_url = image_url
+        self._thread_path = thread_path
+        self._image_path = image_path
+        self._boards = boards
+        self.rps = rps
+
+    def boards(self):
+        return [b for b in self._boards if not b.startswith("_")]
+
+    def image_url(self, board, tim, extension):
+        return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
+
+    def threads_url(self, board):
+        return "%s%s/threads.json" % (self._base_url, board)
+
+    def posts_url(self, board, thread):
+        return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
+
+    def board_hash(self, board):
+        return str((self._boards.index(board) + 1) * 10000)
+
+    @staticmethod
+    def item_id(item):
+        raise NotImplementedError
+
+    @staticmethod
+    def item_mtime(item):
+        raise NotImplementedError
+
+    def item_unique_id(self, item, board):
+        return int(self.board_hash(board) + str(self.item_id(item)))
+
+    @staticmethod
+    def thread_mtime(thread):
+        raise NotImplementedError
+
+    def item_urls(self, item, board):
+        raise NotImplementedError
+
+    @staticmethod
+    def item_type(item):
+        raise NotImplementedError
+
+    @staticmethod
+    def parse_threads_list(r):
+        raise NotImplementedError
+
+    @staticmethod
+    def parse_thread(r):
+        raise NotImplementedError
+
+    @staticmethod
+    def parse_thread(r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        op_el = soup.find("div", attrs={"class": "innerOP"})
+        yield {
+            "id": int(soup.find("div", class_="opCell").get("id")),
+            "type": "thread",
+            "html": str(op_el),
+        }
+
+        for post_el in soup.find_all("div", class_="postCell"):
+            yield {
+                "id": int(post_el.get("id")),
+                "type": "post",
+                "html": str(post_el),
+            }
--- a/chan/json.py
+++ b/chan/json.py
@@ -0,0 +1,60 @@
+import json
+from json import JSONDecodeError
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_body
+from util import logger
+
+
+class JsonChanHelper(ChanHelper):
+
+    @staticmethod
+    def item_id(item):
+        return item["no"]
+
+    @staticmethod
+    def item_mtime(item):
+        return item["time"]
+
+    def item_urls(self, item, board):
+        urls = set()
+
+        if "com" in item and item["com"]:
+            urls.update(get_links_from_body(item["com"]))
+        elif "sub" in item and item["sub"]:
+            urls.update(get_links_from_body(item["sub"]))
+        if "fsize" in item and item["fsize"]:
+            urls.add(self.image_url(board, item["tim"], item["ext"]))
+
+        return list(urls)
+
+    @staticmethod
+    def item_type(item):
+        return "thread" if "sub" in item else "post"
+
+    @staticmethod
+    def thread_mtime(thread):
+        return thread["last_modified"]
+
+    @staticmethod
+    def parse_threads_list(r):
+        try:
+            j = json.loads(r.text)
+            if len(j) == 0 or "threads" not in j[0]:
+                logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
+                return [], None
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return [], None
+
+        threads = []
+        for page in j:
+            for thread in page["threads"]:
+                threads.append(thread)
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        j = json.loads(r.text)
+        return j["posts"]
--- a/chan/mayuri.py
+++ b/chan/mayuri.py
@@ -0,0 +1,75 @@
+import json
+from json import JSONDecodeError
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_body
+from util import logger
+
+
+class MayuriChanHelper(ChanHelper):
+    """Chan helper for a paginated JSON API served under ``boards/<page>`` and ``threads/<id>``."""
+
+    def __init__(self, db_id, base_url, image_url, boards, rps):
+        """Initialise the parent helper.
+
+        The two ``None`` arguments fill the parent's fourth and fifth
+        parameters (presumably thread/image paths -- confirm against
+        ``ChanHelper``), which this helper does not use.
+        """
+        super().__init__(db_id, base_url, image_url, None, None, boards, rps)
+
+    @staticmethod
+    def item_id(item):
+        """Return the item's ``id`` field."""
+        return item["id"]
+
+    @staticmethod
+    def item_mtime(item):
+        """Return the item's ``timestamp`` field."""
+        return item["timestamp"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        """Return ``replies_count``, which stands in for a modification time here."""
+        return thread["replies_count"]
+
+    def item_urls(self, item, board):
+        """Return the de-duplicated URLs referenced by *item* (order not guaranteed).
+
+        Links are taken from ``message``; ``subject`` is only scanned when
+        ``message`` is missing or empty. Every entry of ``files`` contributes
+        one URL built from ``self._image_url``.
+        """
+        urls = set()
+
+        if "message" in item and item["message"]:
+            urls.update(get_links_from_body(item["message"]))
+        elif "subject" in item and item["subject"]:
+            urls.update(get_links_from_body(item["subject"]))
+        if item["files"]:
+            for file in item["files"]:
+                # "%" binds tighter than "+": the storage value is substituted
+                # into the template first, then "<name>.<ext>" is appended.
+                urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
+
+        return list(urls)
+
+    @staticmethod
+    def item_type(item):
+        """Return ``"thread"`` when the item has a ``replies_count`` key, else ``"post"``."""
+        return "thread" if "replies_count" in item else "post"
+
+    def parse_threads_list(self, r):
+        """Parse one listing page into a ``(threads, next_url)`` tuple.
+
+        ``next_url`` is the URL of the following page while ``currentPage`` is
+        below ``totalPages`` and ``None`` on the last page. Invalid JSON is
+        logged and yields ``([], None)``.
+        """
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return [], None
+        if j["currentPage"] < j["totalPages"]:
+            return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1,)
+        # Last page: still return a 2-tuple so every code path has the same shape.
+        return j["data"], None
+
+    @staticmethod
+    def parse_thread(r):
+        """Yield the thread item (without its ``replies`` key), then each reply.
+
+        Invalid JSON is logged and nothing is yielded.
+        """
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            # This is a generator, so returning simply ends the iteration.
+            return
+
+        # Shallow copy so that removing "replies" leaves j["data"] intact.
+        thread = dict(j["data"])
+        del thread["replies"]
+        yield thread
+
+        if j["data"]["replies"]:
+            for post in j["data"]["replies"]:
+                yield post
+
+    def threads_url(self, board):
+        """Return the URL of the first listing page; *board* is not used."""
+        return "%sboards/1" % (self._base_url,)
+
+    def posts_url(self, board, thread):
+        """Return the URL of *thread* (an integer id); *board* is not used."""
+        return "%sthreads/%d" % (self._base_url, thread)
--- a/chan/russian_json.py
+++ b/chan/russian_json.py
@@ -0,0 +1,55 @@
+import json
+from json import JSONDecodeError
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_body
+from util import logger
+
+
+class RussianJsonChanHelper(ChanHelper):
+    """Chan helper for JSON APIs whose items use ``num``/``timestamp``/``comment``/``subject`` keys."""
+
+    @staticmethod
+    def item_id(item):
+        """Return the item's ``num`` field converted to ``int``."""
+        return int(item["num"])
+
+    @staticmethod
+    def item_mtime(item):
+        """Return the item's ``timestamp`` field."""
+        return item["timestamp"]
+
+    @staticmethod
+    def parse_threads_list(r):
+        """Parse a thread-list response into ``(threads, None)``.
+
+        Invalid JSON is logged and yields ``([], None)``.
+        """
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return [], None
+        return j["threads"], None
+
+    @staticmethod
+    def parse_thread(r):
+        """Yield every post of every entry in the response's ``threads`` list.
+
+        Invalid JSON is logged and nothing is yielded, mirroring the error
+        handling of ``parse_threads_list``.
+        """
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            # This is a generator, so returning simply ends the iteration.
+            return
+        for thread in j["threads"]:
+            for post in thread["posts"]:
+                yield post
+
+    @staticmethod
+    def thread_mtime(thread):
+        """Return ``posts_count``, which stands in for a modification time here."""
+        return thread["posts_count"]
+
+    @staticmethod
+    def item_type(item):
+        """Return ``"thread"`` when the item has a non-empty ``subject``, else ``"post"``."""
+        return "thread" if "subject" in item and item["subject"] != "" else "post"
+
+    def item_urls(self, item, board):
+        """Return the de-duplicated URLs referenced by *item* (order not guaranteed).
+
+        Links are taken from ``comment``; ``subject`` is only scanned when
+        ``comment`` is missing or empty. Every entry of ``files`` contributes
+        its ``path`` appended to the base URL.
+        """
+        urls = set()
+
+        if "comment" in item and item["comment"]:
+            urls.update(get_links_from_body(item["comment"]))
+        elif "subject" in item and item["subject"]:
+            urls.update(get_links_from_body(item["subject"]))
+
+        for file in item["files"]:
+            # Trailing slashes are stripped from the base URL before the path
+            # is appended (presumably the path starts with "/" -- confirm).
+            urls.add(self._base_url.rstrip("/") + file["path"])
+
+        return list(urls)
--- a/chan/synch_json.py
+++ b/chan/synch_json.py
@@ -0,0 +1,18 @@
+from chan.json import JsonChanHelper
+from post_process import get_links_from_body
+
+
+class SynchJsonChanHelper(JsonChanHelper):
+    """JsonChanHelper variant that builds attachment URLs without the board name."""
+
+    def item_urls(self, item, board):
+        """Return the de-duplicated URLs referenced by *item* (order not guaranteed).
+
+        Links are taken from ``com``, falling back to ``sub`` only when ``com``
+        is missing or empty. When ``fsize`` is set, the attachment URL is the
+        concatenation of the image base URL, the image path, ``tim`` and
+        ``ext``; *board* is not used.
+        """
+        found = set()
+
+        # First non-empty of comment / subject, if any.
+        body = item.get("com") or item.get("sub")
+        if body:
+            found.update(get_links_from_body(body))
+
+        if item.get("fsize"):
+            attachment = self._image_url + self._image_path + item["tim"] + item["ext"]
+            found.add(attachment)
+
+        return list(found)
+
--- a/post_process.py
+++ b/post_process.py
@@ -3,6 +3,7 @@ import hashlib
 import re
 import zlib
 from io import BytesIO
+from urllib.parse import urljoin

 import imagehash
 from PIL import Image
@@ -10,6 +11,7 @@ from PIL import Image
 from util import logger

 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
+HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")

 IMAGE_FILETYPES = (
    # :orig for twitter cdn
@@ -71,7 +73,7 @@ def image_meta(url, url_idx, web):


 def post_process(item, board, helper, web):
-    item["_v"] = 1.4
+    item["_v"] = 1.5
    item["_id"] = helper.item_unique_id(item, board)

    item["_board"] = board
@@ -100,5 +102,13 @@ def get_links_from_body(body):
    return result


+def get_links_from_html_body(body, base_url):
+    """Return every ``href="..."`` target found in *body*, joined onto *base_url*.
+
+    Targets are located with HTML_HREF_RE and returned in document order;
+    duplicates are kept. Each captured value goes through ``urljoin``.
+    """
+    return [
+        urljoin(base_url, href_match.group(1))
+        for href_match in HTML_HREF_RE.finditer(body)
+    ]
+
+
 def is_external(url):
    return not url.startswith(("#", "/"))
--- a/run.py
+++ b/run.py
@@ -10,7 +10,7 @@ from threading import Thread
 import pika

 import monitoring
-from chan import CHANS
+from chan.chan import CHANS
 from post_process import post_process
 from util import logger, Web