diff --git a/README.md b/README.md
index fbdc1a5..d964445 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
 
 Compatible image boards: 4chan, lainchan, uboachan, 22chan,
 wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan.
+horochan, doushio.
 
 Can optionally push monitoring data to InfluxDB. Below is
 an example of Grafana being used to display it.
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
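Since each scraped post is published as one serialised JSON message, any AMQP client can consume the feed. Below is a minimal consumer sketch using pika (the same client `run.py` imports to publish); the queue name `chan_feed` and the `localhost` broker are assumptions for illustration — substitute whatever your deployment declares.

```python
import json

import pika  # same client library run.py uses on the publishing side

# Hypothetical queue/host; check your broker configuration for the
# actual queue the scraper's serialised posts are routed to.
connection = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
channel = connection.channel()

def on_message(ch, method, properties, body):
    item = json.loads(body)  # one serialised post or thread per message
    print(item["_id"], item["_board"])  # fields set by post_process()
    ch.basic_ack(delivery_tag=method.delivery_tag)

channel.basic_consume(queue="chan_feed", on_message_callback=on_message)
channel.start_consuming()
```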
diff --git a/chan.py b/chan.py
deleted file mode 100644
index ce7150b..0000000
--- a/chan.py
+++ /dev/null
@@ -1,527 +0,0 @@
-import json
-from json import JSONDecodeError
-from urllib.parse import urljoin
-
-from bs4 import BeautifulSoup
-
-from post_process import get_links_from_body
-from util import logger
-
-
-class ChanHelper:
-    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
-        self.db_id = db_id
-        self._base_url = base_url
-        self._image_url = image_url
-        self._thread_path = thread_path
-        self._image_path = image_path
-        self._boards = boards
-        self.rps = rps
-
-    def boards(self):
-        return [b for b in self._boards if not b.startswith("_")]
-
-    def image_url(self, board, tim, extension):
-        return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
-
-    def threads_url(self, board):
-        return "%s%s/threads.json" % (self._base_url, board)
-
-    def posts_url(self, board, thread):
-        return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
-
-    def board_hash(self, board):
-        return str((self._boards.index(board) + 1) * 10000)
-
-    @staticmethod
-    def item_id(item):
-        raise NotImplementedError
-
-    @staticmethod
-    def item_mtime(item):
-        raise NotImplementedError
-
-    def item_unique_id(self, item, board):
-        return int(self.board_hash(board) + str(self.item_id(item)))
-
-    @staticmethod
-    def thread_mtime(thread):
-        raise NotImplementedError
-
-    def item_urls(self, item, board):
-        raise NotImplementedError
-
-    @staticmethod
-    def item_type(item):
-        raise NotImplementedError
-
-    @staticmethod
-    def parse_threads_list(r):
-        raise NotImplementedError
-
-    @staticmethod
-    def parse_thread(r):
-        raise NotImplementedError
-
-
-class HtmlChanHelper(ChanHelper):
-
-    def threads_url(self, board):
-        return "%s%s/" % (self._base_url, board)
-
-    def posts_url(self, board, thread):
-        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
-
-    @staticmethod
-    def item_id(item):
-        return item["id"]
-
-    def item_urls(self, item, board):
-        return []
-
-    @staticmethod
-    def item_type(item):
-        return item["type"]
-
-    @staticmethod
-    def thread_mtime(thread):
-        return -1
-
-    @staticmethod
-    def item_mtime(item):
-        return 0  # TODO
-
-    def parse_threads_list(self, r):
-        soup = BeautifulSoup(r.text, "html.parser")
-
-        threads = []
-
-        for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
-            threads.append({
-                "id": int(threadEl.get("id")),
-            })
-
-        next_url = soup.find("a", attrs={"id": "linkNext"})
-        if next_url:
-            return threads, urljoin(r.url, next_url.get("href"))
-        return threads, None
-
-    @staticmethod
-    def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
-
-        op_el = soup.find("div", attrs={"class": "innerOP"})
-        yield {
-            "id": int(soup.find("div", class_="opCell").get("id")),
-            "type": "thread",
-            "html": str(op_el),
-        }
-
-        for post_el in soup.find_all("div", class_="postCell"):
-            yield {
-                "id": int(post_el.get("id")),
-                "type": "post",
-                "html": str(post_el),
-            }
-
-
-class JsonChanHelper(ChanHelper):
-
-    @staticmethod
-    def item_id(item):
-        return item["no"]
-
-    @staticmethod
-    def item_mtime(item):
-        return item["time"]
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "com" in item and item["com"]:
-            urls.update(get_links_from_body(item["com"]))
-        elif "sub" in item and item["sub"]:
-            urls.update(get_links_from_body(item["sub"]))
-        if "fsize" in item and item["fsize"]:
-            urls.add(self.image_url(board, item["tim"], item["ext"]))
-
-        return list(urls)
-
-    @staticmethod
-    def item_type(item):
-        return "thread" if "sub" in item else "post"
-
-    @staticmethod
-    def thread_mtime(thread):
-        return thread["last_modified"]
-
-    @staticmethod
-    def parse_threads_list(r):
-        try:
-            j = json.loads(r.text)
-            if len(j) == 0 or "threads" not in j[0]:
-                logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
-                return [], None
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return [], None
-
-        threads = []
-        for page in j:
-            for thread in page["threads"]:
-                threads.append(thread)
-        return threads, None
-
-    @staticmethod
-    def parse_thread(r):
-        j = json.loads(r.text)
-        return j["posts"]
-
-
-class RussianJsonChanHelper(ChanHelper):
-
-    @staticmethod
-    def item_id(item):
-        return int(item["num"])
-
-    @staticmethod
-    def item_mtime(item):
-        return item["timestamp"]
-
-    @staticmethod
-    def parse_threads_list(r):
-        try:
-            j = json.loads(r.text)
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return [], None
-        return j["threads"], None
-
-    @staticmethod
-    def parse_thread(r):
-        j = json.loads(r.text)
-        for thread in j["threads"]:
-            for post in thread["posts"]:
-                yield post
-
-    @staticmethod
-    def thread_mtime(thread):
-        return thread["posts_count"]
-
-    @staticmethod
-    def item_type(item):
-        return "thread" if "subject" in item and item["subject"] != "" else "post"
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "comment" in item and item["comment"]:
-            urls.update(get_links_from_body(item["comment"]))
-        elif "subject" in item and item["subject"]:
-            urls.update(get_links_from_body(item["subject"]))
-
-        for file in item["files"]:
-            urls.add(self._base_url.rstrip("/") + file["path"])
-
-        return list(urls)
-
-
-class AlokalJsonChanHelper(JsonChanHelper):
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "com" in item and item["com"]:
-            urls.update(get_links_from_body(item["com"]))
-        elif "sub" in item and item["sub"]:
-            urls.update(get_links_from_body(item["sub"]))
-        if "fsize" in item and item["fsize"]:
-            urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
-
-        return list(urls)
-
-
-class SynchJsonChanHelper(JsonChanHelper):
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "com" in item and item["com"]:
-            urls.update(get_links_from_body(item["com"]))
-        elif "sub" in item and item["sub"]:
-            urls.update(get_links_from_body(item["sub"]))
-        if "fsize" in item and item["fsize"]:
-            urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
-
-        return list(urls)
-
-
-class MayuriChanHelper(ChanHelper):
-
-    def __init__(self, db_id, base_url, image_url, boards, rps):
-        super().__init__(db_id, base_url, image_url, None, None, boards, rps)
-
-    @staticmethod
-    def item_id(item):
-        return item["id"]
-
-    @staticmethod
-    def item_mtime(item):
-        return item["timestamp"]
-
-    @staticmethod
-    def thread_mtime(thread):
-        return thread["replies_count"]
-
-    def item_urls(self, item, board):
-        urls = set()
-
-        if "message" in item and item["message"]:
-            urls.update(get_links_from_body(item["message"]))
-        elif "subject" in item and item["subject"]:
-            urls.update(get_links_from_body(item["subject"]))
-        if item["files"]:
-            for file in item["files"]:
-                urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
-
-        return list(urls)
-
-    @staticmethod
-    def item_type(item):
-        return "thread" if "replies_count" in item else "post"
-
-    def parse_threads_list(self, r):
-        try:
-            j = json.loads(r.text)
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return [], None
-        if j["currentPage"] < j["totalPages"]:
-            return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1, )
-        return j["data"]
-
-    @staticmethod
-    def parse_thread(r):
-        try:
-            j = json.loads(r.text)
-        except JSONDecodeError:
-            logger.warning("JSONDecodeError for %s:" % (r.url,))
-            logger.warning(r.text)
-            return []
-
-        thread = dict(j["data"])
-        del thread["replies"]
-        yield thread
-
-        if j["data"]["replies"]:
-            for post in j["data"]["replies"]:
-                yield post
-
-    def threads_url(self, board):
-        return "%sboards/1" % (self._base_url, )
-
-    def posts_url(self, board, thread):
-        return "%sthreads/%d" % (self._base_url, thread)
-
-
-CHANS = {
-    "4chan": JsonChanHelper(
-        1,
-        "https://a.4cdn.org/",
-        "https://i.4cdn.org/",
-        "/thread/",
-        "/",
-        (
-            "a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
-            "k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
-            "vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
-            "cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
-            "bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
-            "gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
-            "news", "out", "po", "pol", "qst", "sci", "soc", "sp",
-            "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
-        ),
-        rps=2
-    ),
-    "lainchan": JsonChanHelper(
-        2,
-        "https://lainchan.org/",
-        "https://lainchan.org/",
-        "/res/",
-        "/src/",
-        (
-            "λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
-            "hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
-            "_mega",
-        ),
-        rps=1 / 60
-    ),
-    "uboachan": JsonChanHelper(
-        3,
-        "https://uboachan.net/",
-        "https://uboachan.net/",
-        "/res/",
-        "/src/",
-        (
-            "yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
-            "ig", "2", "ot", "hikki", "cc", "x", "sugg"
-        ),
-        rps=1 / 120
-    ),
-    "22chan": JsonChanHelper(
-        4,
-        "https://22chan.org/",
-        "https://22chan.org/",
-        "/res/",
-        "/src/",
-        (
-            "a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
-            "sg", "t", "vg"
-        ),
-        rps=1 / 120
-    ),
-    "wizchan": JsonChanHelper(
-        5,
-        "https://wizchan.org/",
-        "https://wizchan.org/",
-        "/res/",
-        "/src/",
-        (
-            "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
-        ),
-        rps=1 / 30
-    ),
-    # TODO
-    "1chan": ChanHelper(
-        6,
-        "https://www.1chan.net/",
-        "https://www.1chan.net/",
-        "/res/",
-        "/src/",
-        (
-            "rails"
-        ),
-        rps=1 / 600
-    ),
-    "2chhk": RussianJsonChanHelper(
-        7,
-        "https://2ch.hk/",
-        "https://2ch.hk/",
-        "/res/",
-        "/src/",
-        (
-            "d", "b", "o", "soc", "media", "r", "api", "rf", "int",
-            "po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
-            "fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
-            "mo", "mov", "mu", "ne", "psy", "re",
-            "sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
-            "wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
-            "wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
-            "web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
-            "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
"ga", - "vape", "h", "ho", "hc", "e", "fet", "sex", "fag" - ), - rps=1 - ), - "endchan": HtmlChanHelper( - 8, - "https://endchan.net/", - "https://endchan.net/", - "/res/", - "/.media/", - ( - "art", "film", "oekaki", "draw", - "adv", "r9k", "hope", "spoon", - "a", "am", "amr", "l", "monster", "m", "2hu", "animach", - "b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu", - "metal", "tv", "f", "clipuploads", - "4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int", - "intl", "lang", "librejp", "rzabczan", "55chan", - "pol", "pdfs", "his", "ggrevols", "horror", "aethism", - "tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600", - "ausneets", "qanonresearch", "polru", "yuri", "christianity", - "kc", "rapport", "news", "brit", "webm", "4chon" - ), - rps=1 - ), - "38chan": JsonChanHelper( - 9, - "http://38chan.net/", - "http://38chan.net/", - "/res/", - "/src/", - ( - "a", "b", "g", "38" - ), - rps=1 / 600 - ), - "alokal": AlokalJsonChanHelper( - 10, - "https://alokal.eu/", - "https://alokal.eu/", - "/", - "src/", - ( - "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp", - "fit", "had", - ), - rps=1 / 4 - ), - "gnfos": JsonChanHelper( - 11, - "https://gnfos.com/", - "https://gnfos.com/", - "/res/", - "/src/", - ( - "jp", "drive" - ), - rps=1 / 60 - ), - "synch": SynchJsonChanHelper( - 12, - "https://syn-ch.ru/", - "https://cdn.syn-ch.ru/", - "/res/", - "src", - ( - "b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg", - "_wh", "old", "test" - ), - rps=1 / 120 - ), - "tahta": JsonChanHelper( - 13, - "https://tahta.ch/", - "https://tahta.ch/", - "/res/", - "/src/", - ( - "b", "g", "s", "v" - ), - rps=1 / 300 - ), - "awsumchan": JsonChanHelper( - 14, - "https://awsumchan.org/", - "https://awsumchan.org/", - "/res/", - "/src/", - ( - "an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp" - ), - rps=1 / 600 - ), - "horochan": MayuriChanHelper( - 15, - "https://api.horochan.ru/v1/", - "https://%s.horochan.ru/src/", - ( - "b" - ), - rps=1/20 - ), -} diff --git a/chan/__init__.py b/chan/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chan/alokal_json.py b/chan/alokal_json.py new file mode 100644 index 0000000..421062a --- /dev/null +++ b/chan/alokal_json.py @@ -0,0 +1,17 @@ +from chan.json import JsonChanHelper +from post_process import get_links_from_body + + +class AlokalJsonChanHelper(JsonChanHelper): + + def item_urls(self, item, board): + urls = set() + + if "com" in item and item["com"]: + urls.update(get_links_from_body(item["com"])) + elif "sub" in item and item["sub"]: + urls.update(get_links_from_body(item["sub"])) + if "fsize" in item and item["fsize"]: + urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"]) + + return list(urls) diff --git a/chan/chan.py b/chan/chan.py new file mode 100644 index 0000000..e473c9e --- /dev/null +++ b/chan/chan.py @@ -0,0 +1,217 @@ +from chan.alokal_json import AlokalJsonChanHelper +from chan.doushio_html import DoushioHtmlChanHelper +from chan.endchan_html import EndchanHtmlChanHelper +from chan.json import JsonChanHelper +from chan.mayuri import MayuriChanHelper +from chan.russian_json import RussianJsonChanHelper +from chan.synch_json import SynchJsonChanHelper + +CHANS = { + "4chan": JsonChanHelper( + 1, + "https://a.4cdn.org/", + "https://i.4cdn.org/", + "/thread/", + "/", + ( + "a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr", + "k", "m", "o", "p", "r", "s", "t", "u", "v", "vg", + "vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa", + "cm", "hm", "lgbt", "y", "3", 
"aco", "adv", "an", "asp", + "bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit", + "gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n", + "news", "out", "po", "pol", "qst", "sci", "soc", "sp", + "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x" + ), + rps=2 + ), + "lainchan": JsonChanHelper( + 2, + "https://lainchan.org/", + "https://lainchan.org/", + "/res/", + "/src/", + ( + "λ", "diy", "sec", "tech", "inter", "lit", "music", "vis", + "hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy", + "_mega", + ), + rps=1 / 60 + ), + "uboachan": JsonChanHelper( + 3, + "https://uboachan.net/", + "https://uboachan.net/", + "/res/", + "/src/", + ( + "yn", "yndd", "fg", "yume", "o", "lit", "media", "og", + "ig", "2", "ot", "hikki", "cc", "x", "sugg" + ), + rps=1 / 120 + ), + "22chan": JsonChanHelper( + 4, + "https://22chan.org/", + "https://22chan.org/", + "/res/", + "/src/", + ( + "a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers", + "sg", "t", "vg" + ), + rps=1 / 120 + ), + "wizchan": JsonChanHelper( + 5, + "https://wizchan.org/", + "https://wizchan.org/", + "/res/", + "/src/", + ( + "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music", + ), + rps=1 / 30 + ), + # TODO + # "1chan": ChanHelper( + # 6, + # "https://www.1chan.net/", + # "https://www.1chan.net/", + # "/res/", + # "/src/", + # ( + # "rails" + # ), + # rps=1 / 600 + # ), + "2chhk": RussianJsonChanHelper( + 7, + "https://2ch.hk/", + "https://2ch.hk/", + "/res/", + "/src/", + ( + "d", "b", "o", "soc", "media", "r", "api", "rf", "int", + "po", "news", "hry", "au", "bi", "biz", "bo", "c", "em", + "fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp", + "mo", "mov", "mu", "ne", "psy", "re", + "sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh", + "wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p", + "wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t", + "web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr", + "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga", + "vape", "h", "ho", "hc", "e", "fet", "sex", "fag" + ), + rps=1 + ), + "endchan": EndchanHtmlChanHelper( + 8, + "https://endchan.net/", + "https://endchan.net/", + "/res/", + "/.media/", + ( + "art", "film", "oekaki", "draw", + "adv", "r9k", "hope", "spoon", + "a", "am", "amr", "l", "monster", "m", "2hu", "animach", + "b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu", + "metal", "tv", "f", "clipuploads", + "4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int", + "intl", "lang", "librejp", "rzabczan", "55chan", + "pol", "pdfs", "his", "ggrevols", "horror", "aethism", + "tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600", + "ausneets", "qanonresearch", "polru", "yuri", "christianity", + "kc", "rapport", "news", "brit", "webm", "4chon" + ), + rps=1 + ), + "38chan": JsonChanHelper( + 9, + "http://38chan.net/", + "http://38chan.net/", + "/res/", + "/src/", + ( + "a", "b", "g", "38" + ), + rps=1 / 600 + ), + "alokal": AlokalJsonChanHelper( + 10, + "https://alokal.eu/", + "https://alokal.eu/", + "/", + "src/", + ( + "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp", + "fit", "had", + ), + rps=1 / 4 + ), + "gnfos": JsonChanHelper( + 11, + "https://gnfos.com/", + "https://gnfos.com/", + "/res/", + "/src/", + ( + "jp", "drive" + ), + rps=1 / 60 + ), + "synch": SynchJsonChanHelper( + 12, + "https://syn-ch.ru/", + "https://cdn.syn-ch.ru/", + "/res/", + "src", + ( + "b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg", + "_wh", "old", "test" + ), + rps=1 / 120 + ), + "tahta": JsonChanHelper( + 13, + "https://tahta.ch/", + 
"https://tahta.ch/", + "/res/", + "/src/", + ( + "b", "g", "s", "v" + ), + rps=1 / 300 + ), + "awsumchan": JsonChanHelper( + 14, + "https://awsumchan.org/", + "https://awsumchan.org/", + "/res/", + "/src/", + ( + "an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp" + ), + rps=1 / 600 + ), + "horochan": MayuriChanHelper( + 15, + "https://api.horochan.ru/v1/", + "https://%s.horochan.ru/src/", + ( + "b", + ), + rps=1 / 20 + ), + "doushio": DoushioHtmlChanHelper( + 16, + "http://doushio.com/", + "http://doushio.com/", + "", + "/ass/", + ( + "moe", + ), + rps=1 + ) +} diff --git a/chan/doushio_html.py b/chan/doushio_html.py new file mode 100644 index 0000000..add3de6 --- /dev/null +++ b/chan/doushio_html.py @@ -0,0 +1,75 @@ +from urllib.parse import urljoin +from dateutil import parser + +from bs4 import BeautifulSoup + +from chan.helper import ChanHelper +from post_process import get_links_from_html_body + + +class DoushioHtmlChanHelper(ChanHelper): + + def threads_url(self, board): + return "%s%s/" % (self._base_url, board) + + def posts_url(self, board, thread): + return "%s%s/%d" % (self._base_url, board, thread) + + @staticmethod + def item_id(item): + return item["id"] + + def item_urls(self, item, board): + return list(set(get_links_from_html_body(item["html"], self._base_url))) + + @staticmethod + def item_type(item): + return item["type"] + + @staticmethod + def thread_mtime(thread): + return -1 + + @staticmethod + def item_mtime(item): + return item["time"] + + def parse_threads_list(self, r): + soup = BeautifulSoup(r.text, "html.parser") + + threads = [] + + for threadEl in soup.find_all("section"): + threads.append({ + "id": int(threadEl.get("id")), + }) + + next_url = soup.find("link", attrs={"rel": "next"}) + if next_url: + return threads, urljoin(r.url, next_url.get("href")) + return threads, None + + @staticmethod + def parse_thread(r): + soup = BeautifulSoup(r.text, "html.parser") + + op_el = soup.find("section") + for post_el in op_el.find_all("article"): + yield { + "id": int(post_el.get("id")), + "type": "post", + "html": str(post_el), + "time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp()) + } + post_el.decompose() + yield { + "id": int(op_el.get("id")), + "type": "thread", + "html": str(op_el), + "time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp()) + } + + + + + diff --git a/chan/endchan_html.py b/chan/endchan_html.py new file mode 100644 index 0000000..1ba5778 --- /dev/null +++ b/chan/endchan_html.py @@ -0,0 +1,74 @@ +import datetime +from urllib.parse import urljoin + +from bs4 import BeautifulSoup + +from chan.helper import ChanHelper +from post_process import get_links_from_html_body + + +class EndchanHtmlChanHelper(ChanHelper): + + def threads_url(self, board): + return "%s%s/" % (self._base_url, board) + + def posts_url(self, board, thread): + return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread) + + @staticmethod + def item_id(item): + return item["id"] + + def item_urls(self, item, board): + return list(set(get_links_from_html_body(item["html"], self._base_url))) + + @staticmethod + def item_type(item): + return item["type"] + + @staticmethod + def thread_mtime(thread): + return -1 + + @staticmethod + def item_mtime(item): + return item["time"] + + def parse_threads_list(self, r): + soup = BeautifulSoup(r.text, "html.parser") + + threads = [] + + for threadEl in soup.find_all("div", attrs={"class": "opCell"}): + threads.append({ + "id": int(threadEl.get("id")), + }) + + 
diff --git a/chan/doushio_html.py b/chan/doushio_html.py
new file mode 100644
index 0000000..add3de6
--- /dev/null
+++ b/chan/doushio_html.py
@@ -0,0 +1,70 @@
+from urllib.parse import urljoin
+from dateutil import parser
+
+from bs4 import BeautifulSoup
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_html_body
+
+
+class DoushioHtmlChanHelper(ChanHelper):
+
+    def threads_url(self, board):
+        return "%s%s/" % (self._base_url, board)
+
+    def posts_url(self, board, thread):
+        return "%s%s/%d" % (self._base_url, board, thread)
+
+    @staticmethod
+    def item_id(item):
+        return item["id"]
+
+    def item_urls(self, item, board):
+        return list(set(get_links_from_html_body(item["html"], self._base_url)))
+
+    @staticmethod
+    def item_type(item):
+        return item["type"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        return -1
+
+    @staticmethod
+    def item_mtime(item):
+        return item["time"]
+
+    def parse_threads_list(self, r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        threads = []
+
+        for threadEl in soup.find_all("section"):
+            threads.append({
+                "id": int(threadEl.get("id")),
+            })
+
+        next_url = soup.find("link", attrs={"rel": "next"})
+        if next_url:
+            return threads, urljoin(r.url, next_url.get("href"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        op_el = soup.find("section")
+        for post_el in op_el.find_all("article"):
+            yield {
+                "id": int(post_el.get("id")),
+                "type": "post",
+                "html": str(post_el),
+                "time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp())
+            }
+            post_el.decompose()
+        yield {
+            "id": int(op_el.get("id")),
+            "type": "thread",
+            "html": str(op_el),
+            "time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp())
+        }
diff --git a/chan/endchan_html.py b/chan/endchan_html.py
new file mode 100644
index 0000000..1ba5778
--- /dev/null
+++ b/chan/endchan_html.py
@@ -0,0 +1,74 @@
+import datetime
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_html_body
+
+
+class EndchanHtmlChanHelper(ChanHelper):
+
+    def threads_url(self, board):
+        return "%s%s/" % (self._base_url, board)
+
+    def posts_url(self, board, thread):
+        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
+
+    @staticmethod
+    def item_id(item):
+        return item["id"]
+
+    def item_urls(self, item, board):
+        return list(set(get_links_from_html_body(item["html"], self._base_url)))
+
+    @staticmethod
+    def item_type(item):
+        return item["type"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        return -1
+
+    @staticmethod
+    def item_mtime(item):
+        return item["time"]
+
+    def parse_threads_list(self, r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        threads = []
+
+        for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
+            threads.append({
+                "id": int(threadEl.get("id")),
+            })
+
+        next_url = soup.find("a", attrs={"id": "linkNext"})
+        if next_url:
+            return threads, urljoin(r.url, next_url.get("href"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        op_el = soup.find("div", attrs={"class": "innerOP"})
+        if not op_el:
+            return []
+        yield {
+            "id": int(soup.find("div", class_="opCell").get("id")),
+            "type": "thread",
+            "html": str(op_el),
+            "time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text,
+                                                   "%m/%d/%Y (%a) %H:%M:%S").timestamp())
+        }
+
+        for post_el in soup.find_all("div", class_="postCell"):
+            yield {
+                "id": int(post_el.get("id")),
+                "type": "post",
+                "html": str(post_el),
+                "time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text,
+                                                       "%m/%d/%Y (%a) %H:%M:%S").timestamp())
+            }
diff --git a/chan/helper.py b/chan/helper.py
new file mode 100644
index 0000000..0dc13f2
--- /dev/null
+++ b/chan/helper.py
@@ -0,0 +1,54 @@
+class ChanHelper:
+    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
+        self.db_id = db_id
+        self._base_url = base_url
+        self._image_url = image_url
+        self._thread_path = thread_path
+        self._image_path = image_path
+        self._boards = boards
+        self.rps = rps
+
+    def boards(self):
+        return [b for b in self._boards if not b.startswith("_")]
+
+    def image_url(self, board, tim, extension):
+        return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
+
+    def threads_url(self, board):
+        return "%s%s/threads.json" % (self._base_url, board)
+
+    def posts_url(self, board, thread):
+        return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
+
+    def board_hash(self, board):
+        return str((self._boards.index(board) + 1) * 10000)
+
+    @staticmethod
+    def item_id(item):
+        raise NotImplementedError
+
+    @staticmethod
+    def item_mtime(item):
+        raise NotImplementedError
+
+    def item_unique_id(self, item, board):
+        return int(self.board_hash(board) + str(self.item_id(item)))
+
+    @staticmethod
+    def thread_mtime(thread):
+        raise NotImplementedError
+
+    def item_urls(self, item, board):
+        raise NotImplementedError
+
+    @staticmethod
+    def item_type(item):
+        raise NotImplementedError
+
+    @staticmethod
+    def parse_threads_list(r):
+        raise NotImplementedError
+
+    @staticmethod
+    def parse_thread(r):
+        raise NotImplementedError
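`board_hash` and `item_unique_id` implement a simple namespacing scheme: each board gets a string prefix derived from its position in the boards tuple, and that prefix is concatenated with the post ID before converting back to an integer. A worked example with illustrative values (not from any real board):

```python
# For the second board in a three-board tuple and post ID 12345:
boards = ("a", "b", "c")
board = "b"
board_hash = str((boards.index(board) + 1) * 10000)  # "20000"
unique_id = int(board_hash + str(12345))             # string concat, then int
assert unique_id == 2000012345
```

This also explains why `board_hash` depends on tuple order: reordering the boards tuple would silently change every `_id` the scraper emits.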
diff --git a/chan/json.py b/chan/json.py
new file mode 100644
index 0000000..7e80104
--- /dev/null
+++ b/chan/json.py
@@ -0,0 +1,60 @@
+import json
+from json import JSONDecodeError
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_body
+from util import logger
+
+
+class JsonChanHelper(ChanHelper):
+
+    @staticmethod
+    def item_id(item):
+        return item["no"]
+
+    @staticmethod
+    def item_mtime(item):
+        return item["time"]
+
+    def item_urls(self, item, board):
+        urls = set()
+
+        if "com" in item and item["com"]:
+            urls.update(get_links_from_body(item["com"]))
+        elif "sub" in item and item["sub"]:
+            urls.update(get_links_from_body(item["sub"]))
+        if "fsize" in item and item["fsize"]:
+            urls.add(self.image_url(board, item["tim"], item["ext"]))
+
+        return list(urls)
+
+    @staticmethod
+    def item_type(item):
+        return "thread" if "sub" in item else "post"
+
+    @staticmethod
+    def thread_mtime(thread):
+        return thread["last_modified"]
+
+    @staticmethod
+    def parse_threads_list(r):
+        try:
+            j = json.loads(r.text)
+            if len(j) == 0 or "threads" not in j[0]:
+                logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
+                return [], None
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return [], None
+
+        threads = []
+        for page in j:
+            for thread in page["threads"]:
+                threads.append(thread)
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        j = json.loads(r.text)
+        return j["posts"]
diff --git a/chan/mayuri.py b/chan/mayuri.py
new file mode 100644
index 0000000..3a47b00
--- /dev/null
+++ b/chan/mayuri.py
@@ -0,0 +1,75 @@
+import json
+from json import JSONDecodeError
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_body
+from util import logger
+
+
+class MayuriChanHelper(ChanHelper):
+
+    def __init__(self, db_id, base_url, image_url, boards, rps):
+        super().__init__(db_id, base_url, image_url, None, None, boards, rps)
+
+    @staticmethod
+    def item_id(item):
+        return item["id"]
+
+    @staticmethod
+    def item_mtime(item):
+        return item["timestamp"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        return thread["replies_count"]
+
+    def item_urls(self, item, board):
+        urls = set()
+
+        if "message" in item and item["message"]:
+            urls.update(get_links_from_body(item["message"]))
+        elif "subject" in item and item["subject"]:
+            urls.update(get_links_from_body(item["subject"]))
+        if item["files"]:
+            for file in item["files"]:
+                urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
+
+        return list(urls)
+
+    @staticmethod
+    def item_type(item):
+        return "thread" if "replies_count" in item else "post"
+
+    def parse_threads_list(self, r):
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return [], None
+        if j["currentPage"] < j["totalPages"]:
+            return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1,)
+        return j["data"], None
+
+    @staticmethod
+    def parse_thread(r):
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return []
+
+        thread = dict(j["data"])
+        del thread["replies"]
+        yield thread
+
+        if j["data"]["replies"]:
+            for post in j["data"]["replies"]:
+                yield post
+
+    def threads_url(self, board):
+        return "%sboards/1" % (self._base_url,)
+
+    def posts_url(self, board, thread):
+        return "%sthreads/%d" % (self._base_url, thread)
diff --git a/chan/russian_json.py b/chan/russian_json.py
new file mode 100644
index 0000000..df40a56
--- /dev/null
+++ b/chan/russian_json.py
@@ -0,0 +1,55 @@
+import json
+from json import JSONDecodeError
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_body
+from util import logger
+
+
+class RussianJsonChanHelper(ChanHelper):
+
+    @staticmethod
+    def item_id(item):
+        return int(item["num"])
+
+    @staticmethod
+    def item_mtime(item):
+        return item["timestamp"]
+
+    @staticmethod
+    def parse_threads_list(r):
+        try:
+            j = json.loads(r.text)
+        except JSONDecodeError:
+            logger.warning("JSONDecodeError for %s:" % (r.url,))
+            logger.warning(r.text)
+            return [], None
+        return j["threads"], None
+
+    @staticmethod
+    def parse_thread(r):
+        j = json.loads(r.text)
+        for thread in j["threads"]:
+            for post in thread["posts"]:
+                yield post
+
+    @staticmethod
+    def thread_mtime(thread):
+        return thread["posts_count"]
+
+    @staticmethod
+    def item_type(item):
+        return "thread" if "subject" in item and item["subject"] != "" else "post"
+
+    def item_urls(self, item, board):
+        urls = set()
+
+        if "comment" in item and item["comment"]:
+            urls.update(get_links_from_body(item["comment"]))
+        elif "subject" in item and item["subject"]:
+            urls.update(get_links_from_body(item["subject"]))
+
+        for file in item["files"]:
+            urls.add(self._base_url.rstrip("/") + file["path"])
+
+        return list(urls)
diff --git a/chan/synch_json.py b/chan/synch_json.py
new file mode 100644
index 0000000..0e6d099
--- /dev/null
+++ b/chan/synch_json.py
@@ -0,0 +1,18 @@
+from chan.json import JsonChanHelper
+from post_process import get_links_from_body
+
+
+class SynchJsonChanHelper(JsonChanHelper):
+
+    def item_urls(self, item, board):
+        urls = set()
+
+        if "com" in item and item["com"]:
+            urls.update(get_links_from_body(item["com"]))
+        elif "sub" in item and item["sub"]:
+            urls.update(get_links_from_body(item["sub"]))
+        if "fsize" in item and item["fsize"]:
+            urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
+
+        return list(urls)
+
diff --git a/post_process.py b/post_process.py
index 0707b31..7b88bd5 100644
--- a/post_process.py
+++ b/post_process.py
@@ -3,6 +3,7 @@ import hashlib
 import re
 import zlib
 from io import BytesIO
+from urllib.parse import urljoin
 
 import imagehash
 from PIL import Image
@@ -10,6 +11,7 @@ from PIL import Image
 from util import logger
 
 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
+HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")
 
 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -71,7 +73,7 @@ def image_meta(url, url_idx, web):
 
 
 def post_process(item, board, helper, web):
-    item["_v"] = 1.4
+    item["_v"] = 1.5
     item["_id"] = helper.item_unique_id(item, board)
 
     item["_board"] = board
@@ -100,5 +102,13 @@ def get_links_from_body(body):
     return result
 
 
+def get_links_from_html_body(body, base_url):
+    result = []
+    for match in HTML_HREF_RE.finditer(body):
+        url = match.group(1)
+        result.append(urljoin(base_url, url))
+    return result
+
+
 def is_external(url):
     return not url.startswith(("#", "/"))
diff --git a/run.py b/run.py
index b73d942..52a7a47 100644
--- a/run.py
+++ b/run.py
@@ -10,7 +10,7 @@ from threading import Thread
 import pika
 
 import monitoring
-from chan import CHANS
+from chan.chan import CHANS
 from post_process import post_process
 from util import logger, Web
 
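The new HTML-backed helpers (doushio, endchan) feed raw post HTML through `get_links_from_html_body`, which extracts every `href` attribute and resolves relative links against the board's base URL via `urljoin`. A quick sketch with made-up HTML showing the expected behaviour:

```python
from post_process import get_links_from_html_body

# Illustrative input, not real board markup: one root-relative link,
# one absolute link.
html = '<a href="/res/123.html">reply</a> <a href="https://example.com/x">ext</a>'
links = get_links_from_html_body(html, "https://endchan.net/b/")
# Root-relative hrefs resolve against the host, absolute URLs pass through:
# ['https://endchan.net/res/123.html', 'https://example.com/x']
print(links)
```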