add chanon

This commit is contained in:
simon 2019-09-12 20:40:31 -04:00
parent d8f63f73cf
commit cfd2f3bdce
4 changed files with 78 additions and 3 deletions

View File

@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
Compatible image boards: 4chan, lainchan, uboachan, Compatible image boards: 4chan, lainchan, uboachan,
22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal, 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
horochan, doushio, desuchan, tgchan, lolnada, 7chan. horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon.
Can optionally push monitoring data to InfluxDB. Below is an Can optionally push monitoring data to InfluxDB. Below is an
example of Grafana being used to display it. example of Grafana being used to display it.

View File

@ -1,6 +1,7 @@
from chan.alokal_json import AlokalJsonChanHelper from chan.alokal_json import AlokalJsonChanHelper
from chan.chan410_html import Chan410HtmlChanHelper from chan.chan410_html import Chan410HtmlChanHelper
from chan.chan7_html import Chan7HtmlChanHelper from chan.chan7_html import Chan7HtmlChanHelper
from chan.chanon_html import ChanonHtmlChanHelper
from chan.desuchan_html import DesuChanHtmlChanHelper from chan.desuchan_html import DesuChanHtmlChanHelper
from chan.doushio_html import DoushioHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper
@ -326,5 +327,17 @@ CHANS = {
"sm", "ss", "unf", "v", "sm", "ss", "unf", "v",
), ),
rps=1 / 30 rps=1 / 30
),
"chanon": ChanonHtmlChanHelper(
25,
"https://chanon.ro/",
"https://chanon.ro/",
"/res/",
"/srs/",
(
"a", "int", "j", "m", "pc", "pol", "prog", "tv",
"b", "milo", "pr0n", "s", "c", "sug",
),
rps=1 / 30
) )
} }

63
chan/chanon_html.py Normal file
View File

@ -0,0 +1,63 @@
import datetime
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from chan.desuchan_html import DesuChanHtmlChanHelper
def _ro_datefmt(text):
    """Return *text* with every capitalised word token removed.

    A token is: optional leading whitespace, one ASCII uppercase letter,
    at least two further word characters, and a single trailing space.
    Presumably this drops the Romanian weekday name from a post date so
    the remainder can be parsed without a Romanian locale -- confirm
    against the site markup.
    """
    capitalised_word = re.compile(r"\s*[A-Z]\w{2,} ")
    return capitalised_word.sub("", text)
def _ts(time, r):
    """Convert a post's date text into an integer Unix timestamp.

    Args:
        time: Raw date text taken from the post's ``<label>``, including
            its leading and trailing newline.
        r: The HTTP response the text came from; only ``r.url`` is read,
            to decide which date layout applies.

    Returns:
        Seconds since the epoch, truncated to ``int``.

    Raises:
        ValueError: If the text does not match the expected layout.

    Note:
        The parsed datetime is naive, so ``timestamp()`` interprets it in
        the local timezone of the machine running this code.
    """
    # /int/ (International) board is in english...
    if "/int/" in r.url:
        layout, raw = "\n%d-%m-%y (%a) %H:%M:%S\n", time
    else:
        # Every other board: drop the capitalised weekday word first.
        layout, raw = "\n[%d.%m.%Y](%H:%M:%S)\n", _ro_datefmt(time)

    parsed = datetime.datetime.strptime(raw, layout)
    return int(parsed.timestamp())
class ChanonHtmlChanHelper(DesuChanHtmlChanHelper):
    """HTML scraper helper for the chanon board.

    Overrides the thread-list and thread parsers of the DesuChan helper
    for pages whose thread containers are ``<div id="thread<N>...">``.
    """

    # Matches a thread container id such as "thread123" or "thread123abc";
    # group 1 is the numeric thread id.
    _THREAD_ID_RE = re.compile(r"thread([0-9]+)[a-zA-Z]*")

    def parse_threads_list(self, r):
        """Extract the threads listed on a board page.

        Args:
            r: HTTP response for a board page; ``r.content`` is decoded as
                UTF-8 with undecodable bytes ignored.

        Returns:
            A ``(threads, next_url)`` tuple. ``threads`` is a list of dicts
            with keys ``"id"`` (int thread number) and ``"omit"`` (int read
            from the ``omittedposts`` span, or 0 when the span is absent).
            ``next_url`` is the absolute URL of the next page, or ``None``
            when no usable next-page form is found.
        """
        thread_id_re = self._THREAD_ID_RE
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        threads = []
        for thread_el in soup.find_all("div", id=lambda tid: tid and thread_id_re.match(tid)):
            omit = thread_el.find("span", class_="omittedposts")
            threads.append({
                "id": int(thread_id_re.match(thread_el.get("id")).group(1)),
                # The count is read from the second line of the span's text.
                "omit": int(omit.text.split("\n")[1]) if omit else 0
            })

        # The form holding an input labelled "Înainte" points at the next
        # page; an action of "none" means there is no further page.
        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Înainte"})
            if next_button and form.get("action") != "none":
                return threads, urljoin(self._base_url, form.get("action"))
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield every reply of a thread page, then the opening post.

        Args:
            r: HTTP response for a thread page; ``r.content`` is decoded as
                UTF-8 with undecodable bytes ignored and ``r.url`` selects
                the date layout used by ``_ts``.

        Yields:
            Dicts with keys ``"id"`` (int), ``"type"`` (``"post"`` for
            replies, ``"thread"`` for the opening post), ``"html"`` (the
            element's markup) and ``"time"`` (int Unix timestamp).
        """
        thread_id_re = ChanonHtmlChanHelper._THREAD_ID_RE
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), "html.parser")

        thread_el = soup.find("div", id=lambda x: x and thread_id_re.match(x))

        # Replies are the <table> elements that are direct children of the
        # thread container.
        for post_el in thread_el.find_all("table", recursive=False):
            # The date text is the last child of the post's <label>.
            *_, time = post_el.find("label").children
            yield {
                # Select the reply cell by CSS class; its id is a 5-character
                # prefix followed by the post number. (The previous
                # attrs={"class", "reply"} was a set literal, which only
                # worked because a non-dict attrs is treated as a class
                # filter.)
                "id": int(post_el.find("td", class_="reply").get("id")[5:]),
                "type": "post",
                "html": str(post_el),
                "time": _ts(time, r)
            }
            # Remove the reply once consumed so the opening post's HTML
            # emitted below no longer contains it.
            post_el.decompose()

        *_, time = thread_el.find("label").children
        yield {
            "id": int(thread_id_re.match(thread_el.get("id")).group(1)),
            "type": "thread",
            "html": str(thread_el),
            "time": _ts(time, r)
        }

View File

@ -58,8 +58,7 @@ class DesuChanHtmlChanHelper(ChanHelper):
op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit()) op_el = soup.find("div", id=lambda tid: tid and tid[1:].isdigit())
for post_el in op_el.find_all("table", recursive=False): for post_el in op_el.find_all("table", recursive=False):
label = post_el.find("label") *_, time = post_el.find("label").children
*_, time = label.children
yield { yield {
"id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]), "id": int(post_el.find("td", attrs={"class", "reply"}).get("id")[5:]),
"type": "post", "type": "post",