From 1f218056677b1832e7b12779054b137b81d297c5 Mon Sep 17 00:00:00 2001
From: simon
Date: Fri, 22 Nov 2019 21:08:06 -0500
Subject: [PATCH] add 2chan

Register a "2chan" entry in CHANS, backed by the new Chan2Helper
(chan/chan2_jap.py), which scrapes the Shift_JIS HTML pages of 2chan.net.
A board name may embed a three-letter subdomain tag in angle brackets;
the helper strips the tag from the path and substitutes the subdomain for
the "{sub}" token of the base URL ("www" when the board has no tag).

run.py: a thread is now revisited when its stored `ts` is missing or older
than 86400 seconds, `ts` is refreshed on every upsert, and publish() sleeps
0.5 seconds before reconnecting after a failure.
---
Review notes (ignored by git am / git apply):
 * Line breaks and indentation were reconstructed. The whitespace of context
   lines could not be checked against the target files; if a hunk is
   rejected, retry with `git apply --ignore-whitespace`.
 * The blob ids on the `index` lines are those of the original commit and no
   longer describe the post-image.

 chan/chan.py      | 128 ++++++++++++++++++++++++++++++++++++++++++-
 chan/chan2_jap.py | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 run.py            |   8 +--
 3 files changed, 289 insertions(+), 4 deletions(-)
 create mode 100644 chan/chan2_jap.py

diff --git a/chan/chan.py b/chan/chan.py
index 5bb1387..1774a68 100644
--- a/chan/chan.py
+++ b/chan/chan.py
@@ -1,4 +1,5 @@
 from chan.alokal_json import AlokalJsonChanHelper
+from chan.chan2_jap import Chan2Helper
 from chan.chan410_html import Chan410HtmlChanHelper
 from chan.chan7_html import Chan7HtmlChanHelper
 from chan.chanon_html import ChanonHtmlChanHelper
@@ -460,7 +461,7 @@ CHANS = {
             "cl", "co", "ec", "es", "mx", "pe", "py", "uy", "ve", "d", "h", "o", "s", "sar", "scl", "sco", "ses", "smx",
             "spe", "sve",
         ),
-        rps=1/20
+        rps=1 / 20
     ),
     "sushigirl": JsonChanHelper(
         31,
@@ -499,4 +500,129 @@ CHANS = {
         ),
         rps=1 / 15
     ),
+    "2chan": Chan2Helper(
+        34,
+        "https://{sub}.2chan.net",
+        "https://{sub}.2chan.net",
+        "/res/",
+        "/src/",
+        (
+            "1",  # baseball
+            "12",  # soccer
+            "25",  # Mahjong
+            "26",  # Horses
+            "27",  # Cats
+            "d",  # Animals
+            "z",  # Plant life
+            "w",  # Insects
+            "49",  # Aquatic life
+            "62",  # Outdoor
+            "t",  # Cooking
+            "20",  # Sweets
+            "21",  # ramen
+            "e",  # vehicles
+            "j",  # moto & scooters
+            "37",  # Bicycles
+            "45",  # Cameras
+            "48",  # Consumer electronics
+            "r",  # railroad
+            "img2",  # 2-D
+            "b",  # Nijura
+            "b",  # NOTE(review): identical to the entry above; presumably had its own subdomain tag - confirm
+            "b",  # NOTE(review): identical to the entry above; presumably had its own subdomain tag - confirm
+            "jun",
+
+            "58",  # 2-D flipside, reposting not allowed (二次元裏転載不可)
+            "59",  # 2-D flipside, reposting allowed (二次元裏転載可)
+
+            "id",  # 2-D ID
+            "23",  # Speedgrapher
+            "18",  # 2d-Live
+            "16",  # 2-D Neta
+            "43",  # 2-D Industry
+
+            "74",  # FGO
+            "75",  # Idolmaster (アイマス)
+            "78",  # Umehara general (ウメハラ総合)
+
+            "31",  # Games
+            "28",  # Net games
+
+            "56",  # Social games (ソシャゲ)
+            "60",  # KanColle (艦これ)
+            "69",  # Moai (モアイ)
+            "65",  # Touken Ranbu (刀剣乱舞)
+            "64",  # Fortune telling (占い)
+            "66",  # Fashion (ファッション)
+            "67",  # Travel (旅行)
+            "68",  # Child rearing (子育て)
+
+            "webm",
+
+            "71",  # "Soudane" / "I agree" (そうだね)
+            "82",  # Nintendo (任天堂)
+            "61",  # Sony (ソニー)
+
+            "10",  # Net characters
+            "34",  # Narikiri
+            "11",  # Original art
+            "14",  # Original art flipside
+            "32",  # Crossdressing
+            "15",  # Bara
+            "7",  # Yuri
+            "8",  # Yaoi
+            "o",  # 2-D Guro
+            "51",  # 2-D Guro flipside
+            "5",  # Erotic games
+            "3",  # Homebrew PC
+            "g",  # Tokusatsu
+            "2",  # Robot manga and anime
+
+            "63",  # Movies (映画)
+
+            "44",  # Toys
+            "v",  # Models
+            "y",  # Models flipside nov
+            "47",  # Models flipside jun
+            "46",  # Figures
+            "73",  # VTuber
+            "81",  # Synthesized voice (合成音声)
+
+            "x",  # 3DCG
+            "35",  # Politics
+            "36",  # Economics
+            "79",  # Economics
+            "38",  # Korean economics
+
+            "80",  # Shinzo Abe (安倍晋三)
+            "50",  # 3-D live (三次実況)
+
+            "f",  # Military
+            "39",  # Military flipside
+            "m",  # Mathematics
+            "i",  # Flash
+            "k",  # Wallpaper
+            "l",  # 2D Wallpaper
+            "40",  # Touhou
+
+            "55",  # Touhou flipside (東方裏)
+
+            "p",  # Oekaki
+            "q",  # Rakugaki
+            "u",  # Rakugaki flipside
+            "6",  # News desk
+            "76",  # Showa era (昭和)
+            "77",  # Heisei era (平成)
+            "9",  # Idle chat
+            "52",  # Great Tohoku Earthquake of 2011
+            "53",  # Nuclear power
+            "70",  # New board proposals (新板提案)
+            "54",  # IPv6
+            "layout",
+
+            "oe",  # ??? unidentified board titled "お絵sql"
+            "72",  # ??? unidentified board titled "お絵sqlip"
+        ),
+        rps=1 / 3
+    ),
 }
diff --git a/chan/chan2_jap.py b/chan/chan2_jap.py
new file mode 100644
index 0000000..e206ed7
--- /dev/null
+++ b/chan/chan2_jap.py
@@ -0,0 +1,157 @@
+import datetime
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+from hexlib.misc import strhash, signed64
+
+from chan.helper import ChanHelper
+from post_process import get_links_from_html_body
+import re
+
+# Matches the subdomain tag embedded in a board name: three lowercase letters in angle brackets.
+SUBDOMAIN_PATTERN = re.compile("<([a-z]{3})>")
+# Token in the configured base URL that is swapped for the board's subdomain.
+SUBDOMAIN_PLACEHOLDER = "{sub}"
+
+# Matches "YY/MM/DD(w)HH:MM:SS", where (w) is any single character in parentheses.
+TIME_PATTERN = re.compile(r"([0-9]{2}/[0-9]{2}/[0-9]{2}\(.\)[0-9]{2}:[0-9]{2}:[0-9]{2})")
+
+
+def _ja_datefmt(text):
+    """Replace every parenthesised single character in ``text`` with a space."""
+    return re.sub(r"\(.\)", " ", text)
+
+
+class Chan2Helper(ChanHelper):
+    """ChanHelper for 2chan.net: builds per-subdomain URLs and parses its HTML pages."""
+
+    def _subdomain(self, board):
+        """Return the subdomain tagged in ``board``, or "www" when it carries no tag."""
+        m = SUBDOMAIN_PATTERN.search(board)
+        if m:
+            return m.group(1)
+        return "www"
+
+    def _trim(self, board):
+        """Return ``board`` with any subdomain tag removed."""
+        return SUBDOMAIN_PATTERN.sub("", board)
+
+    def _board_base_url(self, board):
+        """Return the base URL with its placeholder replaced by the subdomain of ``board``."""
+        # str.replace("", x) would insert x between every character of the URL,
+        # so the placeholder has to be an explicit, non-empty token.
+        return self._base_url.replace(SUBDOMAIN_PLACEHOLDER, self._subdomain(board))
+
+    def threads_url(self, board):
+        """Return the URL of the thread index page of ``board``."""
+        return "%s/%s/" % (self._board_base_url(board), self._trim(board))
+
+    def posts_url(self, board, thread):
+        """Return the URL of the page that lists the posts of ``thread``."""
+        return "%s/%s%s%d.htm" % (self._board_base_url(board), self._trim(board), self._thread_path,
+                                  self.item_id(thread))
+
+    @staticmethod
+    def item_id(item):
+        """Return the numeric id stored on a parsed thread or post."""
+        return item["id"]
+
+    def item_urls(self, item, board):
+        """Return the distinct links found in the item's HTML, skipping javascript ones."""
+        return [url for url in
+                set(get_links_from_html_body(item["html"], self._board_base_url(board)))
+                if "javascript" not in url
+                ]
+
+    @staticmethod
+    def item_type(item):
+        """Return "thread" or "post", as set by parse_thread."""
+        return item["type"]
+
+    @staticmethod
+    def thread_mtime(thread):
+        """Return the change marker computed by parse_threads_list (not a real timestamp)."""
+        return thread["omit"]
+
+    @staticmethod
+    def item_mtime(item):
+        """Return the item's time as computed by parse_thread."""
+        return item["time"]
+
+    def parse_threads_list(self, r):
+        """Parse a board index response into ``(threads, next_page_url)``.
+
+        Pagination is disabled, so the second element is always None.
+        """
+        soup = BeautifulSoup(r.content.decode('Shift_JIS', 'ignore'), "html.parser")
+
+        threads = []
+
+        for threadEl in soup.find_all("div", class_="thre"):
+            omit = threadEl.find("font", color="#707070")
+            # Example: レス9件省略。全て読むには返信ボタンを押してください。
+            # ("9 replies omitted. Press the reply button to read them all.")
+
+            threads.append({
+                "id": int(threadEl.get("data-res")),
+                # Hash of the notice text, so it changes whenever the text does; 0 when there is no notice.
+                "omit": signed64(strhash(omit.text)) if omit else 0
+            })
+
+        # for btn in soup.find_all("input"):
+        #     if btn.get("value") == "次のページ":
+        #         return threads, urljoin(r.url, btn.parent.get("action"))
+        return threads, None
+
+    @staticmethod
+    def parse_thread(r):
+        """Yield one dict per reply (type "post"), then one for the opening post (type "thread")."""
+        soup = BeautifulSoup(r.content.decode('Shift_JIS', 'ignore'), "html.parser")
+
+        op_el = soup.find("div", class_="thre")
+        tid = int(op_el.get("data-res"))
+
+        for post_el in op_el.find_all("table", recursive=False):
+
+            cnw = post_el.find("span", class_="cnw")
+            if cnw:
+                time = cnw.text.split(" ")[0]
+            else:
+                time = TIME_PATTERN.search(post_el.text).group(1)
+
+            sod = post_el.find("a", id=lambda x: x and x[2:].isnumeric())
+            if sod:
+                # www
+                id_str = sod.get("id")[2:]
+            else:
+                # may
+                inputEl = post_el.find("input")
+                if inputEl:
+                    id_str = inputEl.get("name")
+                else:
+                    id_str = post_el.find("span", id=lambda x: x).get("id")[len("delcheck"):]
+
+            yield {
+                "id": int(id_str),
+                "type": "post",
+                "html": str(post_el),
+                # NOTE(review): strptime yields a naive datetime, so timestamp() assumes the local timezone.
+                "time": int(datetime.datetime.strptime(_ja_datefmt(time), "%y/%m/%d %H:%M:%S").timestamp()),
+                "parent": tid
+            }
+            # Drop the reply from the tree so it is not repeated in the opening post's HTML below.
+            post_el.decompose()
+
+        cnw = op_el.find("span", class_="cnw")
+        if cnw:
+            # www
+            time = cnw.text.split(" ")[0]
+        else:
+            # may
+            time = TIME_PATTERN.search(op_el.text).group(1)
+        yield {
+            "id": tid,
+            "type": "thread",
+            "html": str(op_el),
+            "time": int(datetime.datetime.strptime(_ja_datefmt(time), "%y/%m/%d %H:%M:%S").timestamp()),
+        }
diff --git a/run.py b/run.py
index 99afbf3..38f57d3 100644
--- a/run.py
+++ b/run.py
@@ -2,6 +2,7 @@ import datetime
 import json
 import sqlite3
 import sys
+import time
 import traceback
 from datetime import datetime
 from queue import Queue
@@ -118,11 +119,11 @@ class ChanState:
         with sqlite3.connect(self._db, timeout=5000) as conn:
             cur = conn.cursor()
             cur.execute(
-                "SELECT last_modified FROM threads WHERE thread=? AND chan=?",
+                "SELECT last_modified, ts FROM threads WHERE thread=? AND chan=?",
                 (helper.item_unique_id(thread, board), helper.db_id)
             )
             row = cur.fetchone()
-            if not row or helper.thread_mtime(thread) != row[0]:
+            if not row or helper.thread_mtime(thread) != row[0] or row[1] is None or row[1] + 86400 < int(time.time()):
                 return True
             return False
 
@@ -132,7 +133,7 @@ class ChanState:
                 "INSERT INTO threads (thread, last_modified, chan) "
                 "VALUES (?,?,?) "
                 "ON CONFLICT (thread, chan) "
-                "DO UPDATE SET last_modified=?",
+                "DO UPDATE SET last_modified=?, ts=(strftime('%s','now'))",
                 (helper.item_unique_id(thread, board), helper.thread_mtime(thread), helper.db_id,
                  helper.thread_mtime(thread))
             )
@@ -183,6 +184,7 @@ def publish(item, board, helper, channel, web):
         except Exception as e:
             logger.debug(traceback.format_exc())
             logger.error(str(e))
+            time.sleep(0.5)
             channel = connect()
 
 