add 2chan

This commit is contained in:
simon 2019-11-22 21:08:06 -05:00
parent ee666496e0
commit 1f21805667
3 changed files with 259 additions and 4 deletions

View File

@ -1,4 +1,5 @@
from chan.alokal_json import AlokalJsonChanHelper
from chan.chan2_jap import Chan2Helper
from chan.chan410_html import Chan410HtmlChanHelper
from chan.chan7_html import Chan7HtmlChanHelper
from chan.chanon_html import ChanonHtmlChanHelper
@ -460,7 +461,7 @@ CHANS = {
"cl", "co", "ec", "es", "mx", "pe", "py", "uy", "ve", "d",
"h", "o", "s", "sar", "scl", "sco", "ses", "smx", "spe", "sve",
),
rps=1/20
rps=1 / 20
),
"sushigirl": JsonChanHelper(
31,
@ -499,4 +500,129 @@ CHANS = {
),
rps=1 / 15
),
"2chan": Chan2Helper(
34,
"https://<sub>.2chan.net",
"https://<sub>.2chan.net",
"/res/",
"/src/",
(
"1", # baseball
"12", # soccer
"25<may>", # Mahjong
"26<may>", # Horses
"27<may>", # Cats,
"d", # Animals
"z", # Plant life
"w", # Insects
"49", # Aquatic life
"62<dec>", # Outdoor
"t", # Cooking
"20", # Sweets
"21", # ramen
"e", # vehicles
"j", # moto & scooters
"37<nov>", # Bicycles
"45", # Cameras
"48", # Consumer electronics
"r", # railroad
"img2", # 2-D
"b<dec>", # Nijura
"b<may>",
"b<jun>",
"jun<jun>",
"58<dec>", # ??? 二次元裏転載不可
"59<dec>", # ??? 二次元裏転載可
"id<may>", # 2-D ID
"23", # Speedgrapher
"18<dec>", # 2d-Live
"16", # 2-D Neta
"43", # 2-D Industry
"74<dec>", # ??? FGO
"75<dec>", # ??? アイマス
"78<dec>", # ??? ウメハラ総合
"31<jun>", # Games
"28<nov>", # Net games
"56<dec>", # ??? ソシャゲ
"60<dec>", # ??? 艦これ
"69<dec>", # ??? モアイ
"65<dec>", # ??? 刀剣乱舞
"64<dec>", # ??? 占い
"66<dec>", # ??? ファッション
"67<dec>", # ??? 旅行
"68<dec>", # ??? 子育て
"webm<may>",
"71<dec>", # ??? そうだね
"82<dec>", # ??? 任天堂
"61<dec>", # ??? ソニー
"10", # Net characters
"34<nov>", # Narikiri
"11", # Original art
"14", # Original art flipside
"32", # Crossdressing
"15", # Bara
"7", # Yuri
"8", # Yaoi
"o", # 2-D Guro
"51", # 2-D Guro flipside
"5", # Erotic games
"3", # Homebrew PC
"g", # Tokusatsu
"2", # Robot manga and anime
"63<dec>", # 映画
"44", # Toys
"v", # Models
"y<nov>", # Models flipside nov
"47", # Models flipside jun
"46", # Figures
"73<dec>", # VTuber
"81<dec>", # 合成音声
"x", # 3DCG
"35<nov>", # Politics
"36<nov>", # Economics
"79<dec>", # Economics
"38", # Korean economics
"80<dec>", # ??? 安倍晋三
"50<dec>", # ??? 三次実況
"f", # Military
"39<may>", # Military flipside
"m", # Mathematics
"i", # Flash
"k", # Wallpaper
"l", # 2D Wallpaper
"40<may>", # Touhou
"55<dec>", # ??? 東方裏
"p", # Oekaki
"q<nov>", # Rakugaki
"u", # Rakugaki flipside
"6", # News desk
"76<dec>", # ??? 昭和
"77<dec>", # ??? 平成
"9<img>", # Idle chat
"52", # Great tohoku Earthquake of 2011
"53", # Nuclear power
"70<dec>", # ??? 新板提案
"54", # IPv6
"layout<may>",
"oe", # ??? お絵sql
"72", # ??? お絵sqlip
),
rps=1 / 3
),
}

127
chan/chan2_jap.py Normal file
View File

@ -0,0 +1,127 @@
import datetime
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from hexlib.misc import strhash, signed64
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
import re
# Matches a "<xxx>" marker (exactly three lowercase letters inside angle
# brackets) embedded in a board name, e.g. the "<may>" in "b<may>".
# Group 1 is the marker text without the brackets.
SUBDOMAIN_PATTERN = re.compile("<([a-z]{3})>")
# Matches a post timestamp of the form "NN/NN/NN(c)NN:NN:NN", where "(c)" is
# any single character in parentheses. Group 1 is the whole timestamp.
TIME_PATTERN = re.compile(r"([0-9]{2}/[0-9]{2}/[0-9]{2}\(.\)[0-9]{2}:[0-9]{2}:[0-9]{2})")
def _ja_datefmt(text):
return re.sub(r"\(.\)", " ", text)
class Chan2Helper(ChanHelper):
    """Chan helper for boards served from ``<sub>.2chan.net`` style hosts.

    Board names may embed a ``<xxx>`` marker (three lowercase letters, see
    ``SUBDOMAIN_PATTERN``) naming the subdomain that hosts the board, e.g.
    ``"b<may>"``. Boards without a marker are served from ``www``. The
    marker is substituted for the ``<sub>`` placeholder in ``self._base_url``
    and stripped from the board name when building URLs.
    """

    # Post times on the page carry no zone information. They are interpreted
    # as UTC+9 instead of the crawler host's local zone, so the resulting
    # epoch values do not depend on where the crawler runs.
    # NOTE(review): assumes the site prints Japan Standard Time -- confirm.
    _JST = datetime.timezone(datetime.timedelta(hours=9))

    def _subdomain(self, board):
        """Return the subdomain marker embedded in *board*, or ``"www"`` if none."""
        match = SUBDOMAIN_PATTERN.search(board)
        return match.group(1) if match else "www"

    def _trim(self, board):
        """Return *board* with any ``<xxx>`` subdomain marker removed."""
        return SUBDOMAIN_PATTERN.sub("", board)

    def _board_base_url(self, board):
        """Return ``self._base_url`` with ``<sub>`` replaced by the board's subdomain."""
        return self._base_url.replace("<sub>", self._subdomain(board))

    def threads_url(self, board):
        """Return the URL of the thread index page of *board*."""
        return "%s/%s/" % (self._board_base_url(board), self._trim(board))

    def posts_url(self, board, thread):
        """Return the URL of the page holding all posts of *thread* on *board*."""
        return "%s/%s%s%d.htm" % (
            self._board_base_url(board),
            self._trim(board),
            self._thread_path,
            self.item_id(thread),
        )

    @staticmethod
    def item_id(item):
        """Return the numeric id stored under ``item["id"]``."""
        return item["id"]

    def item_urls(self, item, board):
        """Return the distinct links found in ``item["html"]``.

        Links are resolved against the board's base URL by
        ``get_links_from_html_body``; any URL containing ``"javascript"``
        is dropped.
        """
        links = set(get_links_from_html_body(item["html"], self._board_base_url(board)))
        return [url for url in links if "javascript" not in url]

    @staticmethod
    def item_type(item):
        """Return ``item["type"]`` (``"thread"`` or ``"post"`` as produced by ``parse_thread``)."""
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        """Return the change marker of *thread*.

        This is not a timestamp: it is the hash of the "replies omitted"
        notice computed in ``parse_threads_list`` (0 when there is none),
        so it changes whenever the text of that notice changes.
        """
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
        """Return the epoch timestamp stored under ``item["time"]``."""
        return item["time"]

    def parse_threads_list(self, r):
        """Parse a thread index page.

        :param r: response object exposing the raw page bytes as ``r.content``
        :return: ``(threads, None)`` where each thread is a dict with keys
            ``"id"`` and ``"omit"``; the second element is always ``None``
            because pagination is currently disabled (see below).
        """
        soup = BeautifulSoup(r.content.decode('Shift_JIS', 'ignore'), "html.parser")

        threads = []
        for thread_el in soup.find_all("div", class_="thre"):
            omit = thread_el.find("font", color="#707070")
            # Example: <font color="#707070">レス9件省略。全て読むには返信ボタンを押してください。</font>
            threads.append({
                "id": int(thread_el.get("data-res")),
                "omit": signed64(strhash(omit.text)) if omit else 0,
            })

        # Pagination is disabled; only the requested index page is parsed.
        # for btn in soup.find_all("input"):
        #     if btn.get("value") == "次のページ":
        #         return threads, urljoin(r.url, btn.parent.get("action"))
        return threads, None

    @staticmethod
    def _posted_at(el):
        """Return the posting time found in *el* as an integer epoch timestamp.

        The time is taken from the ``span.cnw`` element when present (the
        original code labels this the "www" layout); otherwise it is
        searched for in the element's text with ``TIME_PATTERN`` (labelled
        the "may" layout).
        """
        cnw = el.find("span", class_="cnw")
        if cnw:
            raw = cnw.text.split(" ")[0]
        else:
            raw = TIME_PATTERN.search(el.text).group(1)
        parsed = datetime.datetime.strptime(_ja_datefmt(raw), "%y/%m/%d %H:%M:%S")
        # Attach an explicit zone: .timestamp() on a naive datetime would
        # silently use the host's local timezone.
        return int(parsed.replace(tzinfo=Chan2Helper._JST).timestamp())

    @staticmethod
    def _reply_id(post_el):
        """Return the numeric id of the reply contained in *post_el*.

        Three markups are tried in order: an ``<a>`` whose id is a
        two-character prefix followed by digits, an ``<input>`` whose
        ``name`` is the id, and finally a ``<span>`` whose id is
        ``"delcheck"`` followed by the id.
        """
        sod = post_el.find("a", id=lambda x: x and x[2:].isnumeric())
        if sod:
            # www
            return int(sod.get("id")[2:])
        # may
        input_el = post_el.find("input")
        if input_el:
            return int(input_el.get("name"))
        return int(post_el.find("span", id=lambda x: x).get("id")[len("delcheck"):])

    @staticmethod
    def parse_thread(r):
        """Yield every reply of a thread page, then the thread (OP) itself.

        :param r: response object exposing the raw page bytes as ``r.content``
        :return: generator of dicts with keys ``"id"``, ``"type"``,
            ``"html"`` and ``"time"``; replies additionally carry
            ``"parent"`` (the thread id). Nothing is yielded when the page
            contains no ``div.thre`` element.
        """
        soup = BeautifulSoup(r.content.decode('Shift_JIS', 'ignore'), "html.parser")
        op_el = soup.find("div", class_="thre")
        if op_el is None:
            # No thread container on the page: nothing to parse.
            return
        tid = int(op_el.get("data-res"))

        for post_el in op_el.find_all("table", recursive=False):
            posted_at = Chan2Helper._posted_at(post_el)
            post_id = Chan2Helper._reply_id(post_el)
            yield {
                "id": post_id,
                "type": "post",
                "html": str(post_el),
                "time": posted_at,
                "parent": tid,
            }
            # Remove the reply from the tree so that what remains in op_el
            # once the loop finishes is the opening post only.
            post_el.decompose()

        yield {
            "id": tid,
            "type": "thread",
            "html": str(op_el),
            "time": Chan2Helper._posted_at(op_el),
        }

8
run.py
View File

@ -2,6 +2,7 @@ import datetime
import json
import sqlite3
import sys
import time
import traceback
from datetime import datetime
from queue import Queue
@ -118,11 +119,11 @@ class ChanState:
with sqlite3.connect(self._db, timeout=5000) as conn:
cur = conn.cursor()
cur.execute(
"SELECT last_modified FROM threads WHERE thread=? AND chan=?",
"SELECT last_modified, ts FROM threads WHERE thread=? AND chan=?",
(helper.item_unique_id(thread, board), helper.db_id)
)
row = cur.fetchone()
if not row or helper.thread_mtime(thread) != row[0]:
if not row or helper.thread_mtime(thread) != row[0] or row[1] + 86400 < int(time.time()):
return True
return False
@ -132,7 +133,7 @@ class ChanState:
"INSERT INTO threads (thread, last_modified, chan) "
"VALUES (?,?,?) "
"ON CONFLICT (thread, chan) "
"DO UPDATE SET last_modified=?",
"DO UPDATE SET last_modified=?, ts=(strftime('%s','now'))",
(helper.item_unique_id(thread, board), helper.thread_mtime(thread), helper.db_id,
helper.thread_mtime(thread))
)
@ -183,6 +184,7 @@ def publish(item, board, helper, channel, web):
except Exception as e:
logger.debug(traceback.format_exc())
logger.error(str(e))
time.sleep(0.5)
channel = connect()