mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-10 14:06:42 +00:00
Link to parent, version bump, add iichan
This commit is contained in:
parent
8ef56871a9
commit
828a14ee98
16
chan/chan.py
16
chan/chan.py
@ -6,6 +6,7 @@ from chan.desuchan_html import DesuChanHtmlChanHelper
|
||||
from chan.doushio_html import DoushioHtmlChanHelper
|
||||
from chan.endchan_html import EndchanHtmlChanHelper
|
||||
from chan.fchan_html import FChanHtmlChanHelper
|
||||
from chan.iichan_html import IichanHtmlChanHelper
|
||||
from chan.json import JsonChanHelper
|
||||
from chan.lolnada_html import LolNadaHtmlChanHelper
|
||||
from chan.mayuri import MayuriChanHelper
|
||||
@ -350,5 +351,20 @@ CHANS = {
|
||||
"b", "goys"
|
||||
),
|
||||
rps=1 / 60
|
||||
),
|
||||
"iichan": IichanHtmlChanHelper(
|
||||
27,
|
||||
"https://iichan.hk/",
|
||||
"https://iichan.hk/",
|
||||
"/res/",
|
||||
"/src/",
|
||||
(
|
||||
"d", "b", "bro", "ci", "cu", "dev", "gf", "hr", "l",
|
||||
"m", "med", "mi", "mu", "o", "ph", "r", "s", "sci",
|
||||
"tran", "tu", "tv", "x", "es", "vq", "au", "tr", "a",
|
||||
"aa", "abe", "c", "fi", "jp", "rm", "tan", "to", "ts",
|
||||
"vn", "vo", "misc"
|
||||
),
|
||||
rps=1 / 10
|
||||
)
|
||||
}
|
||||
|
@ -32,20 +32,27 @@ class Chan410HtmlChanHelper(DesuChanHtmlChanHelper):
|
||||
|
||||
op_el = soup.find("form", id="delform")
|
||||
|
||||
posts = []
|
||||
for post_el in op_el.find_all("div", class_="reply"):
|
||||
yield {
|
||||
posts.append({
|
||||
"id": int(post_el.get("id")[5:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("span", class_="time").text),
|
||||
"%d.%m.%Y %H:%M:%S").timestamp())
|
||||
}
|
||||
})
|
||||
post_el.decompose()
|
||||
|
||||
tid = int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name"))
|
||||
yield {
|
||||
"id": int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("span", class_="time").text),
|
||||
"%d.%m.%Y %H:%M:%S").timestamp())
|
||||
}
|
||||
|
||||
for post in posts:
|
||||
post["parent"] = tid
|
||||
yield post
|
||||
|
||||
|
@ -38,8 +38,9 @@ class Chan7HtmlChanHelper(DesuChanHtmlChanHelper):
|
||||
thread_el = soup.find("div", id=lambda x: x and re.match("thread_[0-9]+_[a-zA-Z]*", x))
|
||||
op_el = thread_el.find("div", class_="post")
|
||||
time = "".join(s for s in op_el.find("div", class_="post_header").contents if isinstance(s, str))
|
||||
tid = int(op_el.get("id"))
|
||||
yield {
|
||||
"id": int(op_el.get("id")),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp())
|
||||
@ -51,5 +52,6 @@ class Chan7HtmlChanHelper(DesuChanHtmlChanHelper):
|
||||
"id": int(post_el.get("id")[6:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp())
|
||||
"time": int(datetime.datetime.strptime(_trim_time(time), "\n%y/%m/%d(%a)%H:%M\n").timestamp()),
|
||||
"parent": tid
|
||||
}
|
||||
|
@ -44,19 +44,21 @@ class ChanonHtmlChanHelper(DesuChanHtmlChanHelper):
|
||||
|
||||
thread_el = soup.find("div", id=lambda x: x and re.match("thread[0-9]+[a-zA-Z]*", x))
|
||||
|
||||
tid = int(re.search("thread([0-9]+)[a-zA-Z]*", thread_el.get("id")).group(1))
|
||||
for post_el in thread_el.find_all("table", recursive=False):
|
||||
*_, time = post_el.find("label").children
|
||||
yield {
|
||||
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": _ts(time, r)
|
||||
"time": _ts(time, r),
|
||||
"parent": tid,
|
||||
}
|
||||
post_el.decompose()
|
||||
|
||||
*_, time = thread_el.find("label").children
|
||||
yield {
|
||||
"id": int(re.search("thread([0-9]+)[a-zA-Z]*", thread_el.get("id")).group(1)),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(thread_el),
|
||||
"time": _ts(time, r)
|
||||
|
@ -57,19 +57,21 @@ class DesuChanHtmlChanHelper(ChanHelper):
|
||||
|
||||
op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
|
||||
|
||||
tid = int(op_el.get("id")[1:])
|
||||
for post_el in op_el.find_all("table", recursive=False):
|
||||
*_, time = post_el.find("label").children
|
||||
yield {
|
||||
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp())
|
||||
"time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp()),
|
||||
"parent": tid
|
||||
}
|
||||
post_el.decompose()
|
||||
|
||||
*_, time = op_el.find("label").children
|
||||
yield {
|
||||
"id": int(op_el.get("id")[1:]),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(datetime.datetime.strptime(time, "\n%y/%m/%d(%a)%H:%M").timestamp())
|
||||
|
@ -56,16 +56,18 @@ class DoushioHtmlChanHelper(ChanHelper):
|
||||
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
|
||||
|
||||
op_el = soup.find("section")
|
||||
tid = int(op_el.get("id"))
|
||||
for post_el in op_el.find_all("article"):
|
||||
yield {
|
||||
"id": int(post_el.get("id")),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp())
|
||||
"time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp()),
|
||||
"parent": tid
|
||||
}
|
||||
post_el.decompose()
|
||||
yield {
|
||||
"id": int(op_el.get("id")),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp())
|
||||
|
@ -58,8 +58,9 @@ class EndchanHtmlChanHelper(ChanHelper):
|
||||
op_el = soup.find("div", attrs={"class": "innerOP"})
|
||||
if not op_el:
|
||||
return []
|
||||
tid = int(soup.find("div", class_="opCell").get("id"))
|
||||
yield {
|
||||
"id": int(soup.find("div", class_="opCell").get("id")),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text,
|
||||
@ -72,5 +73,6 @@ class EndchanHtmlChanHelper(ChanHelper):
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text,
|
||||
"%m/%d/%Y (%a) %H:%M:%S").timestamp())
|
||||
"%m/%d/%Y (%a) %H:%M:%S").timestamp()),
|
||||
"parent": tid
|
||||
}
|
||||
|
@ -34,22 +34,29 @@ class FChanHtmlChanHelper(DesuChanHtmlChanHelper):
|
||||
|
||||
is_op = True
|
||||
|
||||
posts = []
|
||||
tid = None
|
||||
for post_el in op_el.find_all("table", recursive=False):
|
||||
label = post_el.find("label")
|
||||
*_, time = label.children
|
||||
if is_op:
|
||||
tid = int(op_el.get("id")[6:])
|
||||
yield {
|
||||
"id": int(op_el.get("id")[6:]),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(time.strip(), "%y/%m/%d(%a)%H:%M").timestamp())
|
||||
}
|
||||
is_op = False
|
||||
else:
|
||||
yield {
|
||||
posts.append({
|
||||
"id": int(post_el.find("td", class_=lambda x: x and "reply" in x).get("id")[5:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(time.strip(), "%y/%m/%d(%a)%H:%M").timestamp())
|
||||
}
|
||||
})
|
||||
|
||||
for post in posts:
|
||||
post["parent"] = tid
|
||||
yield post
|
||||
|
||||
|
78
chan/iichan_html.py
Normal file
78
chan/iichan_html.py
Normal file
@ -0,0 +1,78 @@
|
||||
import datetime
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from chan.desuchan_html import DesuChanHtmlChanHelper
|
||||
from util import logger
|
||||
|
||||
|
||||
def _ts(text):
|
||||
time = re.sub(r"^\w{2} ", "", text.strip()) \
|
||||
.replace("января", "01") \
|
||||
.replace("февраля", "02") \
|
||||
.replace("марта", "03") \
|
||||
.replace("апреля", "04") \
|
||||
.replace("мая", "05") \
|
||||
.replace("июня", "06") \
|
||||
.replace("июля", "07") \
|
||||
.replace("августа", "08") \
|
||||
.replace("сентября", "09") \
|
||||
.replace("октября", "10") \
|
||||
.replace("ноября", "11") \
|
||||
.replace("декабря", "12") \
|
||||
.replace("⑨", "9")
|
||||
# For some reason, some dates are fuzzed / in chinese
|
||||
try:
|
||||
return int(datetime.datetime.strptime(time, "%d %m %Y %H:%M:%S").timestamp())
|
||||
except Exception as e:
|
||||
logger.warning("Error during date parsing (iichan): " + str(e))
|
||||
return 0
|
||||
|
||||
|
||||
class IichanHtmlChanHelper(DesuChanHtmlChanHelper):
    """Scraper helper for iichan.hk, a wakaba-style HTML imageboard."""

    def parse_threads_list(self, r):
        """Extract thread summaries from a board index page.

        Returns a ``(threads, next_page_url)`` tuple where each thread is a
        dict with ``id`` and ``omit`` (count of omitted posts), and
        ``next_page_url`` is the absolute URL of the next index page, or
        ``None`` on the last page.
        """
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        threads = []
        for thread_div in soup.find_all("div", id=lambda tid: tid and re.match("thread-([0-9]+)$", tid)):
            omitted = thread_div.find("span", class_="omittedposts")
            threads.append({
                "id": int(re.search("thread-([0-9]+)", thread_div.get("id")).group(1)),
                # e.g. "Пропущено 12 сообщений" -> 12; 0 when nothing omitted
                "omit": int(omitted.text.strip().split(" ")[1]) if omitted else 0,
            })

        # Pagination: the "Далее" (Next) button lives in a <form>; the last
        # page's form has action="none".
        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Далее"})
            if not next_button:
                continue
            if form.get("action") == "none":
                continue
            return threads, urljoin(self._base_url, form.get("action"))
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield all reply posts of a thread page, then the OP item itself."""
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        container = soup.find("div", id=lambda x: x and re.match("thread-[0-9]+", x))

        thread_id = int(re.search("thread-([0-9]+)[a-zA-Z]*", container.get("id")).group(1))
        for reply_el in container.find_all("table", recursive=False):
            *_, date_text = reply_el.find("label").children
            yield {
                # NOTE(review): attrs here is a set literal, not a dict — it
                # mirrors the sibling helpers' pattern; confirm bs4 accepts it.
                "id": int(reply_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
                "type": "post",
                "html": str(reply_el),
                "time": _ts(date_text),
                "parent": thread_id
            }
            # Remove the reply so the final OP html doesn't duplicate it.
            reply_el.decompose()

        *_, date_text = container.find("label").children
        yield {
            "id": thread_id,
            "type": "thread",
            "html": str(container),
            "time": _ts(date_text)
        }
|
@ -62,16 +62,18 @@ class LolNadaHtmlChanHelper(ChanHelper):
|
||||
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")
|
||||
|
||||
op_el = soup.find("div", class_="hilo")
|
||||
tid = int(op_el.get("id")[5:])
|
||||
for post_el in op_el.find_all("div", class_="post reply"):
|
||||
yield {
|
||||
"id": int(post_el.get("id")[6:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(parser.parse(post_el.find("time").get("datetime")).timestamp())
|
||||
"time": int(parser.parse(post_el.find("time").get("datetime")).timestamp()),
|
||||
"parent": tid
|
||||
}
|
||||
post_el.decompose()
|
||||
yield {
|
||||
"id": int(op_el.get("id")[5:]),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(parser.parse(op_el.find("time").get("datetime")).timestamp())
|
||||
|
@ -33,20 +33,26 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
|
||||
|
||||
op_el = soup.find("form", id="delform")
|
||||
|
||||
posts = []
|
||||
for post_el in op_el.find_all("table", recursive=False):
|
||||
*_, time = post_el.find("label").children
|
||||
yield {
|
||||
posts.append({
|
||||
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(time, "\n\n%Y/%m/%d(%a)%H:%M\n").timestamp())
|
||||
}
|
||||
})
|
||||
post_el.decompose()
|
||||
|
||||
*_, time = op_el.find("label").children
|
||||
tid = int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name"))
|
||||
yield {
|
||||
"id": int(op_el.find("a", attrs={"name": lambda x: x and x.isdigit()}).get("name")),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(datetime.datetime.strptime(time, "\n\n%Y/%m/%d(%a)%H:%M\n").timestamp())
|
||||
}
|
||||
|
||||
for post in posts:
|
||||
post["parent"] = tid
|
||||
yield post
|
||||
|
@ -62,17 +62,19 @@ class ZerochanHtmlChanHelper(DoushioHtmlChanHelper):
|
||||
|
||||
op_el = soup.find("section", attrs={"data-id": lambda x: x})
|
||||
|
||||
tid = int(op_el.get("data-id")[1:])
|
||||
for post_el in op_el.find_all("article", attrs={"data-id": lambda x: x}):
|
||||
yield {
|
||||
"id": int(post_el.get("data-id")),
|
||||
"type": "post",
|
||||
"html": str(post_el),
|
||||
"time": int(datetime.datetime.strptime(_ru_datefmt(post_el.find("time").text),
|
||||
"%d %b %Y %H:%M").timestamp())
|
||||
"%d %b %Y %H:%M").timestamp()),
|
||||
"parent": tid,
|
||||
}
|
||||
post_el.decompose()
|
||||
yield {
|
||||
"id": int(op_el.get("data-id")[1:]),
|
||||
"id": tid,
|
||||
"type": "thread",
|
||||
"html": str(op_el),
|
||||
"time": int(datetime.datetime.strptime(_ru_datefmt(op_el.find("time").text),
|
||||
|
@ -73,7 +73,7 @@ def image_meta(url, url_idx, web):
|
||||
|
||||
|
||||
def post_process(item, board, helper, web):
|
||||
item["_v"] = 1.5
|
||||
item["_v"] = 1.6
|
||||
item["_id"] = helper.item_unique_id(item, board)
|
||||
|
||||
item["_board"] = board
|
||||
|
4
run.py
4
run.py
@ -15,6 +15,7 @@ from post_process import post_process
|
||||
from util import logger, Web
|
||||
|
||||
MONITORING = True
|
||||
BYPASS_RPS = False
|
||||
|
||||
|
||||
class ChanScanner:
|
||||
@ -202,6 +203,9 @@ if __name__ == "__main__":
|
||||
chan = sys.argv[2]
|
||||
chan_helper = CHANS[chan]
|
||||
|
||||
if BYPASS_RPS:
|
||||
chan_helper.rps = 10
|
||||
|
||||
if MONITORING:
|
||||
monitoring.init()
|
||||
state = ChanState()
|
||||
|
Loading…
x
Reference in New Issue
Block a user