add 2chan

This commit is contained in:
simon 2019-11-22 21:08:06 -05:00
parent ee666496e0
commit 1f21805667
3 changed files with 259 additions and 4 deletions

View File

@ -1,4 +1,5 @@
from chan.alokal_json import AlokalJsonChanHelper from chan.alokal_json import AlokalJsonChanHelper
from chan.chan2_jap import Chan2Helper
from chan.chan410_html import Chan410HtmlChanHelper from chan.chan410_html import Chan410HtmlChanHelper
from chan.chan7_html import Chan7HtmlChanHelper from chan.chan7_html import Chan7HtmlChanHelper
from chan.chanon_html import ChanonHtmlChanHelper from chan.chanon_html import ChanonHtmlChanHelper
@ -460,7 +461,7 @@ CHANS = {
"cl", "co", "ec", "es", "mx", "pe", "py", "uy", "ve", "d", "cl", "co", "ec", "es", "mx", "pe", "py", "uy", "ve", "d",
"h", "o", "s", "sar", "scl", "sco", "ses", "smx", "spe", "sve", "h", "o", "s", "sar", "scl", "sco", "ses", "smx", "spe", "sve",
), ),
rps=1/20 rps=1 / 20
), ),
"sushigirl": JsonChanHelper( "sushigirl": JsonChanHelper(
31, 31,
@ -499,4 +500,129 @@ CHANS = {
), ),
rps=1 / 15 rps=1 / 15
), ),
"2chan": Chan2Helper(
34,
"https://<sub>.2chan.net",
"https://<sub>.2chan.net",
"/res/",
"/src/",
(
"1", # baseball
"12", # soccer
"25<may>", # Mahjong
"26<may>", # Horses
"27<may>", # Cats,
"d", # Animals
"z", # Plant life
"w", # Insects
"49", # Aquatic life
"62<dec>", # Outdoor
"t", # Cooking
"20", # Sweets
"21", # ramen
"e", # vehicles
"j", # moto & scooters
"37<nov>", # Bicycles
"45", # Cameras
"48", # Consumer electronics
"r", # railroad
"img2", # 2-D
"b<dec>", # Nijura
"b<may>",
"b<jun>",
"jun<jun>",
"58<dec>", # ??? 二次元裏転載不可
"59<dec>", # ??? 二次元裏転載可
"id<may>", # 2-D ID
"23", # Speedgrapher
"18<dec>", # 2d-Live
"16", # 2-D Neta
"43", # 2-D Industry
"74<dec>", # ??? FGO
"75<dec>", # ??? アイマス
"78<dec>", # ??? ウメハラ総合
"31<jun>", # Games
"28<nov>", # Net games
"56<dec>", # ??? ソシャゲ
"60<dec>", # ??? 艦これ
"69<dec>", # ??? モアイ
"65<dec>", # ??? 刀剣乱舞
"64<dec>", # ??? 占い
"66<dec>", # ??? ファッション
"67<dec>", # ??? 旅行
"68<dec>", # ??? 子育て
"webm<may>",
"71<dec>", # ??? そうだね
"82<dec>", # ??? 任天堂
"61<dec>", # ??? ソニー
"10", # Net characters
"34<nov>", # Narikiri
"11", # Original art
"14", # Original art flipside
"32", # Crossdressing
"15", # Bara
"7", # Yuri
"8", # Yaoi
"o", # 2-D Guro
"51", # 2-D Guro flipside
"5", # Erotic games
"3", # Homebrew PC
"g", # Tokusatsu
"2", # Robot manga and anime
"63<dec>", # 映画
"44", # Toys
"v", # Models
"y<nov>", # Models flipside nov
"47", # Models flipside jun
"46", # Figures
"73<dec>", # VTuber
"81<dec>", # 合成音声
"x", # 3DCG
"35<nov>", # Politics
"36<nov>", # Economics
"79<dec>", # Economics
"38", # Korean economics
"80<dec>", # ??? 安倍晋三
"50<dec>", # ??? 三次実況
"f", # Military
"39<may>", # Military flipside
"m", # Mathematics
"i", # Flash
"k", # Wallpaper
"l", # 2D Wallpaper
"40<may>", # Touhou
"55<dec>", # ??? 東方裏
"p", # Oekaki
"q<nov>", # Rakugaki
"u", # Rakugaki flipside
"6", # News desk
"76<dec>", # ??? 昭和
"77<dec>", # ??? 平成
"9<img>", # Idle chat
"52", # Great tohoku Earthquake of 2011
"53", # Nuclear power
"70<dec>", # ??? 新板提案
"54", # IPv6
"layout<may>",
"oe", # ??? お絵sql
"72", # ??? お絵sqlip
),
rps=1 / 3
),
} }

127
chan/chan2_jap.py Normal file
View File

@ -0,0 +1,127 @@
import datetime
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from hexlib.misc import strhash, signed64
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
import re
# Matches a "<xxx>" tag (three lowercase letters) inside a board name and
# captures the letters, e.g. "b<may>" -> "may".
SUBDOMAIN_PATTERN = re.compile(r"<([a-z]{3})>")

# Captures a post timestamp of the form "YY/MM/DD(w)HH:MM:SS", where "(w)" is a
# single character in parentheses (see _ja_datefmt, which replaces it with a
# space before parsing).
TIME_PATTERN = re.compile(
    r"([0-9]{2}/[0-9]{2}/[0-9]{2}\(.\)[0-9]{2}:[0-9]{2}:[0-9]{2})"
)
def _ja_datefmt(text):
return re.sub(r"\(.\)", " ", text)
class Chan2Helper(ChanHelper):
    """Helper that scrapes 2chan.net boards from their HTML pages.

    Board names may carry a ``<xxx>`` suffix (for example ``"b<may>"``) that
    names the subdomain hosting the board; boards without a suffix use
    ``www``. The ``<sub>`` placeholder in ``self._base_url`` is replaced by
    that subdomain, and the suffix is stripped from the board name before it
    is placed in a URL path.
    """

    # The site prints wall-clock times in Japan Standard Time (UTC+9). Parsing
    # them as naive datetimes would make the epoch value depend on the
    # timezone of the machine running the scraper.
    _JST = datetime.timezone(datetime.timedelta(hours=9))
    _TIME_FORMAT = "%y/%m/%d %H:%M:%S"

    def _subdomain(self, board):
        """Return the subdomain tagged in *board*, or ``"www"`` if untagged."""
        m = SUBDOMAIN_PATTERN.search(board)
        if m:
            return m.group(1)
        return "www"

    def _trim(self, board):
        """Return *board* with any ``<xxx>`` subdomain tag removed."""
        return SUBDOMAIN_PATTERN.sub("", board)

    def _board_base_url(self, board):
        """Return ``self._base_url`` with ``<sub>`` filled in for *board*."""
        return self._base_url.replace("<sub>", self._subdomain(board))

    def threads_url(self, board):
        """Return the URL of the thread listing page for *board*."""
        return "%s/%s/" % (self._board_base_url(board), self._trim(board))

    def posts_url(self, board, thread):
        """Return the URL of the HTML page for *thread* on *board*."""
        return "%s/%s%s%d.htm" % (
            self._board_base_url(board),
            self._trim(board),
            self._thread_path,
            self.item_id(thread),
        )

    @staticmethod
    def item_id(item):
        """Return the numeric id stored on a parsed thread/post dict."""
        return item["id"]

    def item_urls(self, item, board):
        """Return the de-duplicated links found in the item's HTML.

        Links are resolved by ``get_links_from_html_body`` against the board's
        base URL; any URL containing ``"javascript"`` is dropped.
        """
        links = set(get_links_from_html_body(item["html"], self._board_base_url(board)))
        return [url for url in links if "javascript" not in url]

    @staticmethod
    def item_type(item):
        """Return the item's type string (``"thread"`` or ``"post"``)."""
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        """Return the thread's change marker.

        This is the ``"omit"`` value produced by ``parse_threads_list``: a hash
        of the "replies omitted" notice text, or 0 when there is no notice.
        """
        return thread["omit"]

    @staticmethod
    def item_mtime(item):
        """Return the item's post time as an integer Unix timestamp."""
        return item["time"]

    def parse_threads_list(self, r):
        """Parse a board index page into a list of thread stubs.

        :param r: HTTP response whose ``content`` is the Shift_JIS encoded page.
        :return: ``(threads, None)`` where each thread is a dict with ``"id"``
            and ``"omit"`` keys. The second element is always ``None``;
            presumably it is the next-page URL slot (see the disabled
            pagination code below) -- confirm against ``ChanHelper``.
        """
        soup = BeautifulSoup(r.content.decode('Shift_JIS', 'ignore'), "html.parser")

        threads = []
        for thread_el in soup.find_all("div", class_="thre"):
            omit = thread_el.find("font", color="#707070")
            # Example: <font color="#707070">レス9件省略。全て読むには返信ボタンを押してください。</font>
            # The notice text includes the omitted-reply count, so hashing it
            # gives a value that changes when that count changes.
            threads.append({
                "id": int(thread_el.get("data-res")),
                "omit": signed64(strhash(omit.text)) if omit else 0
            })

        # Pagination is currently disabled; only the fetched page is parsed.
        # for btn in soup.find_all("input"):
        #     if btn.get("value") == "次のページ":
        #         return threads, urljoin(r.url, btn.parent.get("action"))
        return threads, None

    @staticmethod
    def _element_time(el):
        """Return the post time found in *el* as an integer Unix timestamp.

        Uses the text of the first ``span.cnw`` when present, otherwise the
        first match of ``TIME_PATTERN`` in the element's text. Raises
        ``AttributeError`` if neither yields a timestamp.
        """
        cnw = el.find("span", class_="cnw")
        if cnw:
            # Markup noted as "www" in the original code.
            raw = cnw.text.split(" ")[0]
        else:
            # Markup noted as "may" in the original code.
            raw = TIME_PATTERN.search(el.text).group(1)
        parsed = datetime.datetime.strptime(_ja_datefmt(raw), Chan2Helper._TIME_FORMAT)
        return int(parsed.replace(tzinfo=Chan2Helper._JST).timestamp())

    @staticmethod
    def _reply_id(post_el):
        """Return the numeric post id of a reply ``<table>`` element.

        Tries, in order: an ``<a>`` whose id is two characters followed by
        digits, the ``name`` of the first ``<input>``, and finally the first
        ``<span>`` with an id, stripping its ``"delcheck"`` prefix.
        """
        anchor = post_el.find("a", id=lambda x: x and x[2:].isnumeric())
        if anchor:
            # Markup noted as "www" in the original code.
            id_str = anchor.get("id")[2:]
        else:
            # Markup noted as "may" in the original code.
            input_el = post_el.find("input")
            if input_el:
                id_str = input_el.get("name")
            else:
                id_str = post_el.find("span", id=lambda x: x).get("id")[len("delcheck"):]
        return int(id_str)

    @staticmethod
    def parse_thread(r):
        """Yield every reply of a thread page, then the opening post.

        :param r: HTTP response whose ``content`` is the Shift_JIS encoded page.
        :return: generator of dicts with ``"id"``, ``"type"``, ``"html"`` and
            ``"time"`` keys; replies also carry ``"parent"`` (the thread id).
            Yields nothing when the page has no ``div.thre`` container.
        """
        soup = BeautifulSoup(r.content.decode('Shift_JIS', 'ignore'), "html.parser")

        op_el = soup.find("div", class_="thre")
        if op_el is None:
            # No thread container on the page: nothing to parse.
            return
        tid = int(op_el.get("data-res"))

        for post_el in op_el.find_all("table", recursive=False):
            posted_at = Chan2Helper._element_time(post_el)
            post_id = Chan2Helper._reply_id(post_el)

            yield {
                "id": post_id,
                "type": "post",
                "html": str(post_el),
                "time": posted_at,
                "parent": tid
            }
            # Remove the reply from the tree so that, once all replies are
            # consumed, op_el holds only the opening post's own markup.
            post_el.decompose()

        yield {
            "id": tid,
            "type": "thread",
            "html": str(op_el),
            "time": Chan2Helper._element_time(op_el),
        }

8
run.py
View File

@ -2,6 +2,7 @@ import datetime
import json import json
import sqlite3 import sqlite3
import sys import sys
import time
import traceback import traceback
from datetime import datetime from datetime import datetime
from queue import Queue from queue import Queue
@ -118,11 +119,11 @@ class ChanState:
with sqlite3.connect(self._db, timeout=5000) as conn: with sqlite3.connect(self._db, timeout=5000) as conn:
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
"SELECT last_modified FROM threads WHERE thread=? AND chan=?", "SELECT last_modified, ts FROM threads WHERE thread=? AND chan=?",
(helper.item_unique_id(thread, board), helper.db_id) (helper.item_unique_id(thread, board), helper.db_id)
) )
row = cur.fetchone() row = cur.fetchone()
if not row or helper.thread_mtime(thread) != row[0]: if not row or helper.thread_mtime(thread) != row[0] or row[1] + 86400 < int(time.time()):
return True return True
return False return False
@ -132,7 +133,7 @@ class ChanState:
"INSERT INTO threads (thread, last_modified, chan) " "INSERT INTO threads (thread, last_modified, chan) "
"VALUES (?,?,?) " "VALUES (?,?,?) "
"ON CONFLICT (thread, chan) " "ON CONFLICT (thread, chan) "
"DO UPDATE SET last_modified=?", "DO UPDATE SET last_modified=?, ts=(strftime('%s','now'))",
(helper.item_unique_id(thread, board), helper.thread_mtime(thread), helper.db_id, (helper.item_unique_id(thread, board), helper.thread_mtime(thread), helper.db_id,
helper.thread_mtime(thread)) helper.thread_mtime(thread))
) )
@ -183,6 +184,7 @@ def publish(item, board, helper, channel, web):
except Exception as e: except Exception as e:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.error(str(e)) logger.error(str(e))
time.sleep(0.5)
channel = connect() channel = connect()