Link to parent, version bump, add iichan

This commit is contained in:
simon 2019-09-14 11:10:09 -04:00
parent 8ef56871a9
commit 828a14ee98
14 changed files with 156 additions and 24 deletions

View File

@ -6,6 +6,7 @@ from chan.desuchan_html import DesuChanHtmlChanHelper
from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper
from chan.fchan_html import FChanHtmlChanHelper
from chan.iichan_html import IichanHtmlChanHelper
from chan.json import JsonChanHelper
from chan.lolnada_html import LolNadaHtmlChanHelper
from chan.mayuri import MayuriChanHelper
@ -350,5 +351,20 @@ CHANS = {
"b", "goys"
),
rps=1 / 60
),
"iichan": IichanHtmlChanHelper(
27,
"https://iichan.hk/",
"https://iichan.hk/",
"/res/",
"/src/",
(
"d", "b", "bro", "ci", "cu", "dev", "gf", "hr", "l",
"m", "med", "mi", "mu", "o", "ph", "r", "s", "sci",
"tran", "tu", "tv", "x", "es", "vq", "au", "tr", "a",
"aa", "abe", "c", "fi", "jp", "rm", "tan", "to", "ts",
"vn", "vo", "misc"
),
rps=1 / 10
)
}

View File

@ -32,20 +32,27 @@ class Chan410HtmlChanHelper(DesuChanHtmlChanHelper):
op_el = soup.find("form", id="delform")
posts = []
for post_el in op_el.find_all("div", class_="reply"):
yield {
posts.append({
"id": int(post_el.get("id")[5:]),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("span", class_="time").text),
"%d.%m.%Y %H:%M:%S").timestamp())
}
})
post_el.decompose()
tid = int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name"))
yield {
"id": int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("span", class_="time").text),
"%d.%m.%Y %H:%M:%S").timestamp())
}
for post in posts:
post["parent"] = tid
yield post

View File

@ -38,8 +38,9 @@ class Chan7HtmlChanHelper(DesuChanHtmlChanHelper):
thread_el = soup.find("div", id=lambda x: x and re.match("thread_[0-9]+_[a-zA-Z]*", x))
op_el = thread_el.find("div", class_="post")
time = "".join(s for s in op_el.find("div", class_="post_header").contents if isinstance(s, str))
tid = int(op_el.get("id"))
yield {
"id": int(op_el.get("id")),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp())
@ -51,5 +52,6 @@ class Chan7HtmlChanHelper(DesuChanHtmlChanHelper):
"id": int(post_el.get("id")[6:]),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp())
"time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp()),
"parent": tid
}

View File

@ -44,19 +44,21 @@ class ChanonHtmlChanHelper(DesuChanHtmlChanHelper):
thread_el = soup.find("div", id=lambda x: x and re.match("thread[0-9]+[a-zA-Z]*", x))
tid = int(re.search("thread([0-9]+)[a-zA-Z]*", thread_el.get("id")).group(1))
for post_el in thread_el.find_all("table", recursive=False):
*_, time = post_el.find("label").children
yield {
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
"type": "post",
"html": str(post_el),
"time": _ts(time, r)
"time": _ts(time, r),
"parent": tid,
}
post_el.decompose()
*_, time = thread_el.find("label").children
yield {
"id": int(re.search("thread([0-9]+)[a-zA-Z]*", thread_el.get("id")).group(1)),
"id": tid,
"type": "thread",
"html": str(thread_el),
"time": _ts(time, r)

View File

@ -57,19 +57,21 @@ class DesuChanHtmlChanHelper(ChanHelper):
op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
tid = int(op_el.get("id")[1:])
for post_el in op_el.find_all("table", recursive=False):
*_, time = post_el.find("label").children
yield {
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp())
"time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp()),
"parent": tid
}
post_el.decompose()
*_, time = op_el.find("label").children
yield {
"id": int(op_el.get("id")[1:]),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp())

View File

@ -56,16 +56,18 @@ class DoushioHtmlChanHelper(ChanHelper):
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
op_el = soup.find("section")
tid = int(op_el.get("id"))
for post_el in op_el.find_all("article"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
"time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp())
"time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp()),
"parent": tid
}
post_el.decompose()
yield {
"id": int(op_el.get("id")),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp())

View File

@ -58,8 +58,9 @@ class EndchanHtmlChanHelper(ChanHelper):
op_el = soup.find("div", attrs={"class": "innerOP"})
if not op_el:
return []
tid = int(soup.find("div", class_="opCell").get("id"))
yield {
"id": int(soup.find("div", class_="opCell").get("id")),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text,
@ -72,5 +73,6 @@ class EndchanHtmlChanHelper(ChanHelper):
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text,
"%m/%d/%Y (%a) %H:%M:%S").timestamp())
"%m/%d/%Y (%a) %H:%M:%S").timestamp()),
"parent": tid
}

View File

@ -34,22 +34,29 @@ class FChanHtmlChanHelper(DesuChanHtmlChanHelper):
is_op = True
posts = []
tid = None
for post_el in op_el.find_all("table", recursive=False):
label = post_el.find("label")
*_, time = label.children
if is_op:
tid = int(op_el.get("id")[6:])
yield {
"id": int(op_el.get("id")[6:]),
"id": tid,
"type": "thread",
"html": str(post_el),
"time": int(datetime.datetime.strptime(time.strip(), "%y/%m/%d(%a)%H:%M").timestamp())
}
is_op = False
else:
yield {
posts.append({
"id": int(post_el.find("td", class_=lambda x: x and "reply" in x).get("id")[5:]),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(time.strip(), "%y/%m/%d(%a)%H:%M").timestamp())
}
})
for post in posts:
post["parent"] = tid
yield post

78
chan/iichan_html.py Normal file
View File

@ -0,0 +1,78 @@
import datetime
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from chan.desuchan_html import DesuChanHtmlChanHelper
from util import logger
def _ts(text):
    """Convert an iichan post date string to a Unix timestamp.

    The expected input looks like ``"Сб 14 сентября 2019 11:10:09"``: a
    two-character weekday abbreviation, the day, the Russian month name in
    the genitive case, the year and the time.

    :param text: raw date text taken from the post's label element
    :return: the timestamp as an int, or 0 when the date cannot be parsed
    """
    # Genitive month names as they appear on the site -> numeric month.
    months = (
        ("января", "01"),
        ("февраля", "02"),
        ("марта", "03"),
        ("апреля", "04"),
        ("мая", "05"),
        ("июня", "06"),
        ("июля", "07"),
        ("августа", "08"),
        ("сентября", "09"),
        ("октября", "10"),
        ("ноября", "11"),
        ("декабря", "12"),
    )

    # Drop the leading weekday abbreviation ("Сб ", "Пн ", ...).
    cleaned = re.sub(r"^\w{2} ", "", text.strip())
    for name, number in months:
        cleaned = cleaned.replace(name, number)

    # For some reason, some dates are fuzzed: a digit 9 can be rendered as
    # the circled-nine glyph. The glyph is written as an escape on purpose:
    # an empty search string would make str.replace() insert "9" between
    # every character and break every date.
    # NOTE(review): U+2468 is presumed to be the substituted character -
    # confirm against a live page.
    cleaned = cleaned.replace("\u2468", "9")

    try:
        return int(datetime.datetime.strptime(cleaned, "%d %m %Y %H:%M:%S").timestamp())
    except Exception as e:
        # Best effort: a post with an unreadable date is still indexed.
        logger.warning("Error during date parsing (iichan): " + str(e))
        return 0
class IichanHtmlChanHelper(DesuChanHtmlChanHelper):
    """Parser for iichan board index pages and thread pages."""

    # Id of a thread container on a board index page, e.g. "thread-123".
    _INDEX_THREAD_ID = re.compile(r"thread-([0-9]+)$")
    # Id prefix of the thread container on a thread page.
    _THREAD_ID = re.compile(r"thread-([0-9]+)")

    def parse_threads_list(self, r):
        """List the threads found on one board index page.

        :param r: HTTP response whose ``content`` is the index page HTML
        :return: ``(threads, next_url)``; ``threads`` is a list of dicts with
            the thread ``id`` and its ``omit`` count (the number read from
            the "omittedposts" span, 0 when the span is absent), and
            ``next_url`` is the absolute URL of the next page or ``None``
            when there is no usable "next" form
        """
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        id_re = self._INDEX_THREAD_ID

        threads = []
        for thread_el in soup.find_all("div", id=lambda tid: tid and id_re.match(tid)):
            omit = thread_el.find("span", class_="omittedposts")
            threads.append({
                "id": int(id_re.match(thread_el.get("id")).group(1)),
                # The count is the second space-separated token of the span text.
                "omit": int(omit.text.strip().split(" ")[1]) if omit is not None else 0,
            })

        # Pagination is a form holding a "Далее" (next) submit button; an
        # action of "none" is treated as "no next page".
        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Далее"})
            if next_button and form.get("action") != "none":
                return threads, urljoin(self._base_url, form.get("action"))
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield every reply of a thread page, then the thread itself.

        Each reply is yielded as a ``"post"`` item carrying the thread id in
        ``"parent"`` and is then removed from the tree, so the final
        ``"thread"`` item's HTML no longer contains the replies.

        :param r: HTTP response whose ``content`` is the thread page HTML
        """
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
        id_re = IichanHtmlChanHelper._THREAD_ID

        thread_el = soup.find("div", id=lambda x: x and id_re.match(x))
        tid = int(id_re.match(thread_el.get("id")).group(1))

        for post_el in thread_el.find_all("table", recursive=False):
            # The date is the last child node of the post's label.
            *_, time_text = post_el.find("label").children
            yield {
                # The cell id is a 5-character prefix followed by the post number.
                "id": int(post_el.find("td", class_="reply").get("id")[5:]),
                "type": "post",
                "html": str(post_el),
                "time": _ts(time_text),
                "parent": tid
            }
            post_el.decompose()

        *_, time_text = thread_el.find("label").children
        yield {
            "id": tid,
            "type": "thread",
            "html": str(thread_el),
            "time": _ts(time_text)
        }

View File

@ -62,16 +62,18 @@ class LolNadaHtmlChanHelper(ChanHelper):
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
op_el = soup.find("div", class_="hilo")
tid = int(op_el.get("id")[5:])
for post_el in op_el.find_all("div", class_="post reply"):
yield {
"id": int(post_el.get("id")[6:]),
"type": "post",
"html": str(post_el),
"time": int(parser.parse(post_el.find("time").get("datetime")).timestamp())
"time": int(parser.parse(post_el.find("time").get("datetime")).timestamp()),
"parent": tid
}
post_el.decompose()
yield {
"id": int(op_el.get("id")[5:]),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(parser.parse(op_el.find("time").get("datetime")).timestamp())

View File

@ -33,20 +33,26 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
op_el = soup.find("form", id="delform")
posts = []
for post_el in op_el.find_all("table", recursive=False):
*_, time = post_el.find("label").children
yield {
posts.append({
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(time, "\n\n%Y/%m/%d(%a)%H:%M\n").timestamp())
}
})
post_el.decompose()
*_, time = op_el.find("label").children
tid = int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name"))
yield {
"id": int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(time, "\n\n%Y/%m/%d(%a)%H:%M\n").timestamp())
}
for post in posts:
post["parent"] = tid
yield post

View File

@ -62,17 +62,19 @@ class ZerochanHtmlChanHelper(DoushioHtmlChanHelper):
op_el = soup.find("section", attrs={"data-id": lambda x: x})
tid = int(op_el.get("data-id")[1:])
for post_el in op_el.find_all("article", attrs={"data-id": lambda x: x}):
yield {
"id": int(post_el.get("data-id")),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(_ru_datefmt(post_el.find("time").text),
"%d %b %Y %H:%M").timestamp())
"%d %b %Y %H:%M").timestamp()),
"parent": tid,
}
post_el.decompose()
yield {
"id": int(op_el.get("data-id")[1:]),
"id": tid,
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("time").text),

View File

@ -73,7 +73,7 @@ def image_meta(url, url_idx, web):
def post_process(item, board, helper, web):
item["_v"] = 1.5
item["_v"] = 1.6
item["_id"] = helper.item_unique_id(item, board)
item["_board"] = board

4
run.py
View File

@ -15,6 +15,7 @@ from post_process import post_process
from util import logger, Web
MONITORING = True
BYPASS_RPS = False
class ChanScanner:
@ -202,6 +203,9 @@ if __name__ == "__main__":
chan = sys.argv[2]
chan_helper = CHANS[chan]
if BYPASS_RPS:
chan_helper.rps = 10
if MONITORING:
monitoring.init()
state = ChanState()