add 7chan

This commit is contained in:
simon 2019-09-12 19:51:38 -04:00
parent fb2f1419d8
commit d8f63f73cf
5 changed files with 77 additions and 9 deletions

View File

@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
Compatible image boards: 4chan, lainchan, uboachan, Compatible image boards: 4chan, lainchan, uboachan,
22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal, 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
horochan, doushio, desuchan, tgchan, lolnada. horochan, doushio, desuchan, tgchan, lolnada, 7chan.
Can optionally push monitoring data to InfluxDB. Below is an Can optionally push monitoring data to InfluxDB. Below is an
example of Grafana being used to display it. example of Grafana being used to display it.

View File

@ -1,5 +1,6 @@
from chan.alokal_json import AlokalJsonChanHelper from chan.alokal_json import AlokalJsonChanHelper
from chan.chan410_html import Chan410HtmlChanHelper from chan.chan410_html import Chan410HtmlChanHelper
from chan.chan7_html import Chan7HtmlChanHelper
from chan.desuchan_html import DesuChanHtmlChanHelper from chan.desuchan_html import DesuChanHtmlChanHelper
from chan.doushio_html import DoushioHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper
@ -307,6 +308,23 @@ CHANS = {
( (
"d", "b", "cu", "dev", "r", "a", "ts", "ci" "d", "b", "cu", "dev", "r", "a", "ts", "ci"
), ),
rps=1 / 60 rps=1 / 120
),
"7chan": Chan7HtmlChanHelper(
24,
"https://7chan.org/",
"https://7chan.org/",
"/res/",
"/src/",
(
"7ch", "ch7", "irc", "777", "VIP", "civ", "_vip6",
"b", "banner", "fl", "gfx", "fail", "class", "co",
"eh", "fit", "halp", "jew", "lit", "phi", "pr",
"rnb", "sci", "tg", "w", "zom", "a", "grim", "hi",
"me", "rx", "vg", "wp", "x", "cake", "cd", "d", "di",
"elit", "fag", "fur", "gif", "h", "men", "pco", "s",
"sm", "ss", "unf", "v",
),
rps=1 / 30
) )
} }

55
chan/chan7_html.py Normal file
View File

@ -0,0 +1,55 @@
import datetime
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from chan.desuchan_html import DesuChanHtmlChanHelper
def _trim_time(text):
return re.sub(r"ID: \w+", "", text)
class Chan7HtmlChanHelper(DesuChanHtmlChanHelper):
    """HTML scraper for 7chan board index pages and thread pages."""

    def parse_threads_list(self, r):
        """Extract the threads listed on a board page and the next-page URL.

        :param r: HTTP response; only ``r.content`` (bytes) is read.
        :return: ``(threads, next_url)``. ``threads`` is a list of
            ``{"id": int, "omit": int}`` dicts, where ``omit`` is the number
            read from the thread's ``omittedposts`` span (0 when the span is
            absent). ``next_url`` is the "Next" form's action resolved
            against ``self._base_url``, or ``None`` when there is no usable
            "Next" form.
        """
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        threads = []
        for thread_el in soup.find_all("div", class_="thread"):
            omit = thread_el.find("span", class_="omittedposts")
            threads.append({
                # The element id has the form "thread_<number>_<letters>".
                "id": int(re.search("thread_([0-9]+)_[a-zA-Z]*", thread_el.get("id")).group(1)),
                # The count is taken from the second line of the span's text.
                "omit": int(omit.text.split("\n")[1]) if omit else 0
            })

        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Next"})
            # A "Next" form whose action is the literal "none" is skipped.
            if next_button and form.get("action") != "none":
                return threads, urljoin(self._base_url, form.get("action"))
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield one dict per post found on a thread page, OP first.

        :param r: HTTP response; only ``r.content`` (bytes) is read.
        :return: generator of ``{"id", "type", "html", "time"}`` dicts, where
            ``type`` is ``"thread"`` for the opening post and ``"post"`` for
            each reply, ``html`` is the serialized post element and ``time``
            is an integer UNIX timestamp parsed from the post's header.
        """
        def header_timestamp(el):
            # The date is the bare text directly inside the header div;
            # nested tags are ignored and any "ID: xxx" tag is stripped
            # before parsing.
            header = el.find("div", class_="post_header")
            text = "".join(s for s in header.contents if isinstance(s, str))
            parsed = datetime.datetime.strptime(_trim_time(text), "\n%y/%m/%d(%a)%H:%M\n")
            # NOTE(review): ``parsed`` is naive, so ``timestamp()`` interprets
            # it in the local time zone of the machine running the scraper.
            return int(parsed.timestamp())

        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        thread_el = soup.find("div", id=lambda x: x and re.match("thread_[0-9]+_[a-zA-Z]*", x))
        op_el = thread_el.find("div", class_="post")
        yield {
            "id": int(op_el.get("id")),
            "type": "thread",
            "html": str(op_el),
            "time": header_timestamp(op_el)
        }

        for post_el in thread_el.find_all("div", class_="reply"):
            yield {
                # The first six characters of the reply's id are a prefix
                # before the post number.
                "id": int(post_el.get("id")[6:]),
                "type": "post",
                "html": str(post_el),
                # Read the time from the reply itself; the header lookup used
                # to be done on the OP element, so every reply was emitted
                # with the OP's timestamp.
                # NOTE(review): assumes each div.reply contains its own
                # div.post_header - confirm against live markup.
                "time": header_timestamp(post_el)
            }

View File

@ -21,13 +21,9 @@ class FChanHtmlChanHelper(DesuChanHtmlChanHelper):
"omit": int(omit.text.split(" ")[0]) if omit and omit.text else 0 "omit": int(omit.text.split(" ")[0]) if omit and omit.text else 0
}) })
next_url = None
for a in soup.find_all("a"): for a in soup.find_all("a"):
if a.text == "Next": if a.text == "Next":
next_url = a return threads, urljoin(r.url, a.get("href"))
break
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None return threads, None
@staticmethod @staticmethod

View File

@ -34,8 +34,7 @@ class TgChanHtmlChanHelper(DesuChanHtmlChanHelper):
op_el = soup.find("form", id="delform") op_el = soup.find("form", id="delform")
for post_el in op_el.find_all("table", recursive=False): for post_el in op_el.find_all("table", recursive=False):
label = post_el.find("label") *_, time = post_el.find("label").children
*_, time = label.children
yield { yield {
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]), "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
"type": "post", "type": "post",