diff --git a/chan/chan.py b/chan/chan.py index 2c2afae..69c4ef5 100644 --- a/chan/chan.py +++ b/chan/chan.py @@ -6,6 +6,7 @@ from chan.desuchan_html import DesuChanHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper from chan.fchan_html import FChanHtmlChanHelper +from chan.iichan_html import IichanHtmlChanHelper from chan.json import JsonChanHelper from chan.lolnada_html import LolNadaHtmlChanHelper from chan.mayuri import MayuriChanHelper @@ -350,5 +351,20 @@ CHANS = { "b", "goys" ), rps=1 / 60 + ), + "iichan": IichanHtmlChanHelper( + 27, + "https://iichan.hk/", + "https://iichan.hk/", + "/res/", + "/src/", + ( + "d", "b", "bro", "ci", "cu", "dev", "gf", "hr", "l", + "m", "med", "mi", "mu", "o", "ph", "r", "s", "sci", + "tran", "tu", "tv", "x", "es", "vq", "au", "tr", "a", + "aa", "abe", "c", "fi", "jp", "rm", "tan", "to", "ts", + "vn", "vo", "misc" + ), + rps=1 / 10 ) } diff --git a/chan/chan410_html.py b/chan/chan410_html.py index 1afef70..5208bfb 100644 --- a/chan/chan410_html.py +++ b/chan/chan410_html.py @@ -32,20 +32,27 @@ class Chan410HtmlChanHelper(DesuChanHtmlChanHelper): op_el = soup.find("form", id="delform") + posts = [] for post_el in op_el.find_all("div", class_="reply"): - yield { + posts.append({ "id": int(post_el.get("id")[5:]), "type": "post", "html": str(post_el), "time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("span", class_="time").text), "%d.%m.%Y %H:%M:%S").timestamp()) - } + }) post_el.decompose() + tid = int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")) yield { - "id": int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")), + "id": tid, "type": "thread", "html": str(op_el), "time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("span", class_="time").text), "%d.%m.%Y %H:%M:%S").timestamp()) } + + for post in posts: + post["parent"] = tid + yield post + diff --git a/chan/chan7_html.py 
b/chan/chan7_html.py index e8f6cff..2918f20 100644 --- a/chan/chan7_html.py +++ b/chan/chan7_html.py @@ -38,8 +38,9 @@ class Chan7HtmlChanHelper(DesuChanHtmlChanHelper): thread_el = soup.find("div", id=lambda x: x and re.match("thread_[0-9]+_[a-zA-Z]*", x)) op_el = thread_el.find("div", class_="post") time = "".join(s for s in op_el.find("div", class_="post_header").contents if isinstance(s, str)) + tid = int(op_el.get("id")) yield { - "id": int(op_el.get("id")), + "id": tid, "type": "thread", "html": str(op_el), "time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp()) @@ -51,5 +52,6 @@ class Chan7HtmlChanHelper(DesuChanHtmlChanHelper): "id": int(post_el.get("id")[6:]), "type": "post", "html": str(post_el), - "time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp()) + "time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp()), + "parent": tid } diff --git a/chan/chanon_html.py b/chan/chanon_html.py index 6f3ecb2..ac485cc 100644 --- a/chan/chanon_html.py +++ b/chan/chanon_html.py @@ -44,19 +44,21 @@ class ChanonHtmlChanHelper(DesuChanHtmlChanHelper): thread_el = soup.find("div", id=lambda x: x and re.match("thread[0-9]+[a-zA-Z]*", x)) + tid = int(re.search("thread([0-9]+)[a-zA-Z]*", thread_el.get("id")).group(1)) for post_el in thread_el.find_all("table", recursive=False): *_, time = post_el.find("label").children yield { "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]), "type": "post", "html": str(post_el), - "time": _ts(time, r) + "time": _ts(time, r), + "parent": tid, } post_el.decompose() *_, time = thread_el.find("label").children yield { - "id": int(re.search("thread([0-9]+)[a-zA-Z]*", thread_el.get("id")).group(1)), + "id": tid, "type": "thread", "html": str(thread_el), "time": _ts(time, r) diff --git a/chan/desuchan_html.py b/chan/desuchan_html.py index 87205db..51090f9 100644 --- a/chan/desuchan_html.py +++ 
b/chan/desuchan_html.py @@ -57,19 +57,21 @@ class DesuChanHtmlChanHelper(ChanHelper): op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit()) + tid = int(op_el.get("id")[1:]) for post_el in op_el.find_all("table", recursive=False): *_, time = post_el.find("label").children yield { "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]), "type": "post", "html": str(post_el), - "time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp()) + "time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp()), + "parent": tid } post_el.decompose() *_, time = op_el.find("label").children yield { - "id": int(op_el.get("id")[1:]), + "id": tid, "type": "thread", "html": str(op_el), "time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp()) diff --git a/chan/doushio_html.py b/chan/doushio_html.py index 7649a0f..5145b19 100644 --- a/chan/doushio_html.py +++ b/chan/doushio_html.py @@ -56,16 +56,18 @@ class DoushioHtmlChanHelper(ChanHelper): soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser") op_el = soup.find("section") + tid = int(op_el.get("id")) for post_el in op_el.find_all("article"): yield { "id": int(post_el.get("id")), "type": "post", "html": str(post_el), - "time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp()) + "time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp()), + "parent": tid } post_el.decompose() yield { - "id": int(op_el.get("id")), + "id": tid, "type": "thread", "html": str(op_el), "time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp()) diff --git a/chan/endchan_html.py b/chan/endchan_html.py index 6e34236..69aae5c 100644 --- a/chan/endchan_html.py +++ b/chan/endchan_html.py @@ -58,8 +58,9 @@ class EndchanHtmlChanHelper(ChanHelper): op_el = soup.find("div", attrs={"class": "innerOP"}) if not op_el: return [] + tid = int(soup.find("div", 
class_="opCell").get("id")) yield { - "id": int(soup.find("div", class_="opCell").get("id")), + "id": tid, "type": "thread", "html": str(op_el), "time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text, @@ -72,5 +73,6 @@ class EndchanHtmlChanHelper(ChanHelper): "type": "post", "html": str(post_el), "time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text, - "%m/%d/%Y (%a) %H:%M:%S").timestamp()) + "%m/%d/%Y (%a) %H:%M:%S").timestamp()), + "parent": tid } diff --git a/chan/fchan_html.py b/chan/fchan_html.py index fdae93b..2b8704f 100644 --- a/chan/fchan_html.py +++ b/chan/fchan_html.py @@ -34,22 +34,29 @@ class FChanHtmlChanHelper(DesuChanHtmlChanHelper): is_op = True + posts = [] + tid = None for post_el in op_el.find_all("table", recursive=False): label = post_el.find("label") *_, time = label.children if is_op: + tid = int(op_el.get("id")[6:]) yield { - "id": int(op_el.get("id")[6:]), + "id": tid, "type": "thread", "html": str(post_el), "time": int(datetime.datetime.strptime(time.strip(), "%y/%m/%d(%a)%H:%M").timestamp()) } is_op = False else: - yield { + posts.append({ "id": int(post_el.find("td", class_=lambda x: x and "reply" in x).get("id")[5:]), "type": "post", "html": str(post_el), "time": int(datetime.datetime.strptime(time.strip(), "%y/%m/%d(%a)%H:%M").timestamp()) - } + }) + + for post in posts: + post["parent"] = tid + yield post
diff --git a/chan/iichan_html.py b/chan/iichan_html.py
new file mode 100644
index 0000000..38b23b7
--- /dev/null
+++ b/chan/iichan_html.py
@@ -0,0 +1,117 @@
+import datetime
+import re
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from chan.desuchan_html import DesuChanHtmlChanHelper
+from util import logger
+
+# Genitive Russian month names used in iichan dates, mapped to month numbers.
+_RU_MONTHS = (
+    ("января", "01"),
+    ("февраля", "02"),
+    ("марта", "03"),
+    ("апреля", "04"),
+    ("мая", "05"),
+    ("июня", "06"),
+    ("июля", "07"),
+    ("августа", "08"),
+    ("сентября", "09"),
+    ("октября", "10"),
+    ("ноября", "11"),
+    ("декабря", "12"),
+)
+
+# Captures the numeric id from a thread container id such as "thread-123".
+_THREAD_ID_RE = re.compile(r"thread-([0-9]+)")
+
+
+def _ts(text):
+    """Convert the text of an iichan date label to a UNIX timestamp.
+
+    A leading two-character token followed by a space (presumably the
+    weekday abbreviation) is dropped, month names are replaced by their
+    numbers and the result is parsed as "%d %m %Y %H:%M:%S".
+
+    :param text: raw text of the date label
+    :return: the timestamp as an int, or 0 if the date cannot be parsed
+    """
+    stamp = re.sub(r"^\w{2} ", "", text.strip())
+    for name, number in _RU_MONTHS:
+        stamp = stamp.replace(name, number)
+    stamp = stamp.replace("⑨", "9")
+
+    # For some reason, some dates are fuzzed / in Chinese: log them and
+    # fall back to 0 instead of raising.
+    try:
+        return int(datetime.datetime.strptime(stamp, "%d %m %Y %H:%M:%S").timestamp())
+    except (ValueError, OverflowError, OSError) as e:
+        logger.warning("Error during date parsing (iichan): " + str(e))
+        return 0
+
+
+class IichanHtmlChanHelper(DesuChanHtmlChanHelper):
+    """Parser for iichan board pages and thread pages."""
+
+    def parse_threads_list(self, r):
+        """Collect the threads listed on a board page.
+
+        :param r: response object of the board page
+        :return: (threads, next_url); threads is a list of dicts with the
+            thread "id" and its "omit" count, next_url is the absolute URL
+            of the next page or None when no usable next form is found
+        """
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
+
+        threads = []
+
+        for thread_el in soup.find_all("div", id=lambda tid: tid and re.match(r"thread-([0-9]+)$", tid)):
+            omit = thread_el.find("span", class_="omittedposts")
+            threads.append({
+                "id": int(_THREAD_ID_RE.search(thread_el.get("id")).group(1)),
+                "omit": int(omit.text.strip().split(" ")[1]) if omit else 0
+            })
+
+        # The next page is the action of the form holding the "Далее" button.
+        for form in soup.find_all("form"):
+            next_button = form.find("input", attrs={"value": "Далее"})
+            if next_button and form.get("action") != "none":
+                return threads, urljoin(self._base_url, form.get("action"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        """Yield the replies of a thread page, then the thread itself.
+
+        Each reply is removed from the tree after being yielded, so the
+        "html" of the final thread item no longer contains the replies.
+
+        :param r: response object of the thread page
+        :return: generator of item dicts; replies carry the thread id
+            under "parent"
+        """
+        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
+
+        thread_el = soup.find("div", id=lambda x: x and _THREAD_ID_RE.match(x))
+
+        tid = int(_THREAD_ID_RE.search(thread_el.get("id")).group(1))
+        for post_el in thread_el.find_all("table", recursive=False):
+            # The date is the last child of the label element of the post
+            *_, time = post_el.find("label").children
+            yield {
+                "id": int(post_el.find("td", attrs={"class": "reply"}).get("id")[5:]),
+                "type": "post",
+                "html": str(post_el),
+                "time": _ts(time),
+                "parent": tid
+            }
+            post_el.decompose()
+
+        *_, time = thread_el.find("label").children
+        yield {
+            "id": tid,
+            "type": "thread",
+            "html": str(thread_el),
+            "time": _ts(time)
+        }
diff --git a/chan/lolnada_html.py b/chan/lolnada_html.py index b85a767..f6a3a91 100644 ---
a/chan/lolnada_html.py +++ b/chan/lolnada_html.py @@ -62,16 +62,18 @@ class LolNadaHtmlChanHelper(ChanHelper): soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser") op_el = soup.find("div", class_="hilo") + tid = int(op_el.get("id")[5:]) for post_el in op_el.find_all("div", class_="post reply"): yield { "id": int(post_el.get("id")[6:]), "type": "post", "html": str(post_el), - "time": int(parser.parse(post_el.find("time").get("datetime")).timestamp()) + "time": int(parser.parse(post_el.find("time").get("datetime")).timestamp()), + "parent": tid } post_el.decompose() yield { - "id": int(op_el.get("id")[5:]), + "id": tid, "type": "thread", "html": str(op_el), "time": int(parser.parse(op_el.find("time").get("datetime")).timestamp()) diff --git a/chan/tgchan_html.py b/chan/tgchan_html.py index 205d49b..ae54295 100644 --- a/chan/tgchan_html.py +++ b/chan/tgchan_html.py @@ -33,20 +33,26 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper): op_el = soup.find("form", id="delform") + posts = [] for post_el in op_el.find_all("table", recursive=False): *_, time = post_el.find("label").children - yield { + posts.append({ "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]), "type": "post", "html": str(post_el), "time": int(datetime.datetime.strptime(time, "\n\n%Y/%m/%d(%a)%H:%M\n").timestamp()) - } + }) post_el.decompose() *_, time = op_el.find("label").children + tid = int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")) yield { - "id": int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")), + "id": tid, "type": "thread", "html": str(op_el), "time": int(datetime.datetime.strptime(time, "\n\n%Y/%m/%d(%a)%H:%M\n").timestamp()) } + + for post in posts: + post["parent"] = tid + yield post diff --git a/chan/zerochan_html.py b/chan/zerochan_html.py index 2e52af9..c87bd7f 100644 --- a/chan/zerochan_html.py +++ b/chan/zerochan_html.py @@ -62,17 +62,19 @@ class 
ZerochanHtmlChanHelper(DoushioHtmlChanHelper): op_el = soup.find("section", attrs={"data-id": lambda x: x}) + tid = int(op_el.get("data-id")[1:]) for post_el in op_el.find_all("article", attrs={"data-id": lambda x: x}): yield { "id": int(post_el.get("data-id")), "type": "post", "html": str(post_el), "time": int(datetime.datetime.strptime(_ru_datefmt(post_el.find("time").text), - "%d %b %Y %H:%M").timestamp()) + "%d %b %Y %H:%M").timestamp()), + "parent": tid, } post_el.decompose() yield { - "id": int(op_el.get("data-id")[1:]), + "id": tid, "type": "thread", "html": str(op_el), "time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("time").text), diff --git a/post_process.py b/post_process.py index 7b88bd5..158b2c9 100644 --- a/post_process.py +++ b/post_process.py @@ -73,7 +73,7 @@ def image_meta(url, url_idx, web): def post_process(item, board, helper, web): - item["_v"] = 1.5 + item["_v"] = 1.6 item["_id"] = helper.item_unique_id(item, board) item["_board"] = board diff --git a/run.py b/run.py index 398387b..1a87491 100644 --- a/run.py +++ b/run.py @@ -15,6 +15,7 @@ from post_process import post_process from util import logger, Web MONITORING = True +BYPASS_RPS = False class ChanScanner: @@ -202,6 +203,9 @@ if __name__ == "__main__": chan = sys.argv[2] chan_helper = CHANS[chan] + if BYPASS_RPS: + chan_helper.rps = 10 + if MONITORING: monitoring.init() state = ChanState()