diff --git a/chan/chan.py b/chan/chan.py
index 8638200..e38765e 100644
--- a/chan/chan.py
+++ b/chan/chan.py
@@ -9,6 +9,7 @@ from chan.mayuri import MayuriChanHelper
 from chan.russian_json import RussianJsonChanHelper
 from chan.synch_json import SynchJsonChanHelper
 from chan.tgchan_html import TgChanHtmlChanHelper
+from chan.zerochan_html import ZerochanHtmlChanHelper
 
 CHANS = {
     "4chan": JsonChanHelper(
@@ -75,7 +76,7 @@ CHANS = {
         (
             "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
         ),
-        rps=1 / 30
+        rps=1 / 60
     ),
     # TODO
     # "1chan": ChanHelper(
@@ -107,7 +108,7 @@ CHANS = {
             "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
             "vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
         ),
-        rps=1/5
+        rps=1 / 5
     ),
     "endchan": EndchanHtmlChanHelper(
         8,
@@ -128,7 +129,7 @@ CHANS = {
             "ausneets", "qanonresearch", "polru", "yuri", "christianity",
             "kc", "rapport", "news", "brit", "webm", "4chon"
         ),
-        rps=1/2
+        rps=1 / 10
     ),
     "38chan": JsonChanHelper(
         9,
@@ -151,7 +152,7 @@ CHANS = {
             "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
             "fit", "had",
         ),
-        rps=1 / 30
+        rps=1 / 60
     ),
     "gnfos": JsonChanHelper(
         11,
@@ -216,7 +217,7 @@ CHANS = {
         (
             "moe",
         ),
-        rps=1/20
+        rps=1 / 20
     ),
     "desuchan": DesuChanHtmlChanHelper(
         17,
@@ -232,7 +233,7 @@ CHANS = {
             "arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w",
             "sandbox", "sugg"
         ),
-        rps=1/10
+        rps=1 / 30
     ),
     "aurorachan": DesuChanHtmlChanHelper(
         18,
@@ -244,7 +245,7 @@ CHANS = {
             "_bm", "de", "ic", "rp", "rpi", "v", "w", "tg", "alt", "b",
             "g", "pkmn", "yuri", "fl", "mu", "sugg"
         ),
-        rps=1/20
+        rps=1 / 20
     ),
     "tgchan": TgChanHtmlChanHelper(
         19,
@@ -255,7 +256,7 @@ CHANS = {
         (
             "draw", "meep", "quest", "questdis", "tg", "icons",
         ),
-        rps=1/600,
+        rps=1 / 600,
     ),
     "lolnada": LolNadaHtmlChanHelper(
         20,
@@ -267,7 +268,7 @@ CHANS = {
             "b", "a", "aw", "cgl", "dw", "int", "qt", "sad", "t",
             "toy", "v", "x", "34", "e", "f", "h"
         ),
-        rps=1/20,
+        rps=1 / 60,
     ),
     "fchan": FChanHtmlChanHelper(
         21,
@@ -278,6 +279,22 @@ CHANS = {
         (
             "f", "m", "h", "s", "toon", "a", "ah", "c", "artist", "crit", "b"
         ),
-        rps=1/60,
+        rps=1 / 60,
     ),
+    "0chan": ZerochanHtmlChanHelper(
+        22,
+        "https://0-chan.ru/",
+        "https://0-chan.ru/",
+        "",
+        "/assets/",
+        (
+            "0", "0ch", "0chan", "1chan", "2ch", "3dprintor", "8", "\\_b", "a",
+            "an", "asylum", "bb", "bo", "c", "copypaste", "dog", "draw", "e",
+            "elite", "eot", "ergrgergre", "fido", "fur", "g", "game", "hui", "huz",
+            "hw", "ithub", "m", "meta", "naotoudigu", "nhc", "nullchan", "parasha",
+            "poligon", "postach", "psih", "r", "rm", "s", "shrek", "shy", "t",
+            "test", "tlp", "tmp", "tv", "vg", "vipe", "wh", "xikkadvach", "ynet"
+        ),
+        rps=1 / 5
+    )
 }
diff --git a/chan/desuchan_html.py b/chan/desuchan_html.py
index dcba0ab..351f49a 100644
--- a/chan/desuchan_html.py
+++ b/chan/desuchan_html.py
@@ -27,20 +27,22 @@ class DesuChanHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def thread_mtime(thread):
-        return -1  # TODO: Parse the 'X posts, Y images' span
+        return thread["omit"]
 
     @staticmethod
     def item_mtime(item):
         return item["time"]
 
     def parse_threads_list(self, r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         threads = []
 
         for threadEl in soup.find_all("div", id=lambda tid: tid and tid[1:].isdigit()):
+            omit = threadEl.find("span", class_="omittedposts")
             threads.append({
                 "id": int(threadEl.get("id")[1:]),
+                "omit": int(omit.text.split(" ")[0]) if omit else 0
             })
 
         for form in soup.find_all("form"):
@@ -51,7 +53,7 @@ class DesuChanHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
 
diff --git a/chan/doushio_html.py b/chan/doushio_html.py
index a39dc01..7649a0f 100644
--- a/chan/doushio_html.py
+++ b/chan/doushio_html.py
@@ -28,20 +28,22 @@ class DoushioHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def thread_mtime(thread):
-        return -1
+        return thread["omit"]
 
     @staticmethod
     def item_mtime(item):
         return item["time"]
 
     def parse_threads_list(self, r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         threads = []
 
         for threadEl in soup.find_all("section"):
+            omit = threadEl.find("span", class_="omit")
             threads.append({
                 "id": int(threadEl.get("id")),
+                "omit": int(omit.text.split(" ")[0]) if omit else 0
             })
 
         next_url = soup.find("link", attrs={"rel": "next"})
@@ -51,7 +53,7 @@ class DoushioHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         op_el = soup.find("section")
 
         for post_el in op_el.find_all("article"):
diff --git a/chan/endchan_html.py b/chan/endchan_html.py
index 70869e5..6e34236 100644
--- a/chan/endchan_html.py
+++ b/chan/endchan_html.py
@@ -28,20 +28,22 @@ class EndchanHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def thread_mtime(thread):
-        return -1
+        return thread["omit"]
 
     @staticmethod
     def item_mtime(item):
         return item["time"]
 
     def parse_threads_list(self, r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         threads = []
 
         for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
+            omit = threadEl.find("div", class_="labelOmission")
             threads.append({
                 "id": int(threadEl.get("id")),
+                "omit": int(omit.text.split(" ")[0]) if omit else 0
             })
 
         next_url = soup.find("a", attrs={"id": "linkNext"})
@@ -51,7 +53,7 @@ class EndchanHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         op_el = soup.find("div", attrs={"class": "innerOP"})
 
         if not op_el:
diff --git a/chan/fchan_html.py b/chan/fchan_html.py
index 3c44149..c6444fd 100644
--- a/chan/fchan_html.py
+++ b/chan/fchan_html.py
@@ -1,5 +1,5 @@
 import datetime
-import json
+import _strptime
 import re
 from urllib.parse import urljoin
 
@@ -31,7 +31,7 @@ class FChanHtmlChanHelper(DesuChanHtmlChanHelper):
 
     @staticmethod
     def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         op_el = soup.find("div", id=lambda tid: tid and re.match("thread[0-9]+", tid))
 
diff --git a/chan/helper.py b/chan/helper.py
index 64546bd..e89fc29 100644
--- a/chan/helper.py
+++ b/chan/helper.py
@@ -12,7 +12,7 @@ class ChanHelper:
         self.rps = rps
 
     def boards(self):
-        return [b for b in self._boards if not b.startswith("_")]
+        return [b.replace("\\_", "_") for b in self._boards if not b.startswith("_")]
 
     def image_url(self, board, tim, extension):
         return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
diff --git a/chan/lolnada_html.py b/chan/lolnada_html.py
index 5b10795..f229aaa 100644
--- a/chan/lolnada_html.py
+++ b/chan/lolnada_html.py
@@ -32,7 +32,7 @@ class LolNadaHtmlChanHelper(ChanHelper):
 
     @staticmethod
     def thread_mtime(thread):
-        return -1
+        return thread["omit"]
 
     @staticmethod
     def item_mtime(item):
@@ -44,9 +44,11 @@ class LolNadaHtmlChanHelper(ChanHelper):
         threads = []
 
         for threadEl in soup.find_all("div", class_="hilo"):
+            omit = threadEl.find("span", class_="omitted")
             threads.append({
                 "id": int(threadEl.get("data-id")),
                 "url": threadEl.find("a", class_="post_no").get("href"),
+                "omit": int(omit.get("data-omitidos")) if omit else 0
             })
 
         for form in soup.find_all("form"):
diff --git a/chan/tgchan_html.py b/chan/tgchan_html.py
index 9a65d35..58a41cb 100644
--- a/chan/tgchan_html.py
+++ b/chan/tgchan_html.py
@@ -10,7 +10,7 @@ from chan.desuchan_html import DesuChanHtmlChanHelper
 class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
 
     def parse_threads_list(self, r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         threads = []
 
@@ -27,7 +27,7 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
 
     @staticmethod
     def parse_thread(r):
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
 
         op_el = soup.find("form", id="delform")
 
diff --git a/chan/zerochan_html.py b/chan/zerochan_html.py
new file mode 100644
index 0000000..2e52af9
--- /dev/null
+++ b/chan/zerochan_html.py
@@ -0,0 +1,80 @@
+import datetime
+import re
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from chan.doushio_html import DoushioHtmlChanHelper
+from post_process import get_links_from_html_body
+
+
+def _ru_datefmt(text):
+    # For some reason, the dates are not compatible with ru_RU.UTF-8...
+
+    return re.sub(r"\(.{3}\)", "", text) \
+        .replace("Янв", "Jan") \
+        .replace("Фев", "Feb") \
+        .replace("Мар", "Mar") \
+        .replace("Апр", "Apr") \
+        .replace("Май", "May") \
+        .replace("Июн", "Jun") \
+        .replace("Июл", "Jul") \
+        .replace("Авг", "Aug") \
+        .replace("Сен", "Sep") \
+        .replace("Окт", "Oct") \
+        .replace("Ноя", "Nov") \
+        .replace("Дек", "Dec")
+
+
+class ZerochanHtmlChanHelper(DoushioHtmlChanHelper):
+
+    @staticmethod
+    def thread_mtime(thread):
+        return thread["omit"]
+
+    def item_urls(self, item, board):
+        return [
+            x for
+            x in set(get_links_from_html_body(item["html"], self._base_url))
+            if "google.com" not in x and "whatanime.ga" not in x and "iqdb.org" not in x and "saucenao.com" not in x
+        ]
+
+    def parse_threads_list(self, r):
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
+
+        threads = []
+
+        for threadEl in soup.find_all("section", attrs={"data-id": lambda x: x}):
+            omit = threadEl.find("span", class_="omit")
+            threads.append({
+                "id": int(threadEl.get("data-id")),
+                "omit": int(omit.get("data-omit")) if omit else 0
+            })
+
+        for a in soup.find_all("a"):
+            if a.text == ">":
+                return threads, urljoin(r.url, a.get("href"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
+
+        op_el = soup.find("section", attrs={"data-id": lambda x: x})
+
+        for post_el in op_el.find_all("article", attrs={"data-id": lambda x: x}):
+            yield {
+                "id": int(post_el.get("data-id")),
+                "type": "post",
+                "html": str(post_el),
+                "time": int(datetime.datetime.strptime(_ru_datefmt(post_el.find("time").text),
+                                                       "%d %b %Y %H:%M").timestamp())
+            }
+            post_el.decompose()
+        yield {
+            "id": int(op_el.get("data-id")[1:]),
+            "type": "thread",
+            "html": str(op_el),
+            "time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("time").text),
+                                                   "%d %b %Y %H:%M").timestamp())
+        }