add desuchan

This commit is contained in:
simon 2019-09-08 15:50:05 -04:00
parent 18247d4139
commit e13770d603
3 changed files with 100 additions and 2 deletions

View File

@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
Compatible image boards: 4chan, lainchan, uboachan, Compatible image boards: 4chan, lainchan, uboachan,
22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal, 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
horochan, doushio. horochan, doushio, desuchan.
Can optionally push monitoring data to InfluxDB. Below is an Can optionally push monitoring data to InfluxDB. Below is an
example of Grafana being used to display it. example of Grafana being used to display it.

View File

@ -1,4 +1,5 @@
from chan.alokal_json import AlokalJsonChanHelper from chan.alokal_json import AlokalJsonChanHelper
from chan.desuchan_html import DesuChanHtmlChanHelper
from chan.doushio_html import DoushioHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper
from chan.json import JsonChanHelper from chan.json import JsonChanHelper
@ -213,5 +214,22 @@ CHANS = {
"moe", "moe",
), ),
rps=1 rps=1
) ),
"desuchan": DesuChanHtmlChanHelper(
17,
"https://desuchan.net/",
"https://desuchan.net/",
"/res/",
"/src/",
(
"bananas", "boku", "dawa", "desu", "jum", "kashira", "md",
"otousama", "ro", "unyuu", "yakult", "a", "c", "h", "_loli",
"moonspeak", "nagato", "nij", "nipa", "touhou", "tr", "yan",
"yan", "vn", "do", "fi", "lit", "o", "pro", "tech", "v", "vic",
"arrrrr", "brocastan", "gar", "gif", "media", "ot", "r", "w",
"sandbox", "sugg"
),
rps=1/10
),
} }

80
chan/desuchan_html.py Normal file
View File

@ -0,0 +1,80 @@
import datetime
import logging

from bs4 import BeautifulSoup

from chan.helper import ChanHelper
from post_process import get_links_from_html_body
class DesuChanHtmlChanHelper(ChanHelper):
    """Chan helper that scrapes desuchan's HTML board and thread pages.

    Items produced by :meth:`parse_thread` are plain dicts with the keys
    ``id``, ``type`` (``"post"`` or ``"thread"``), ``html`` and ``time``;
    the ``item_*`` accessors below read those keys back.

    The methods rely on ``self._base_url`` and ``self._thread_path``, which
    are presumably set by ``ChanHelper.__init__`` (not visible from this
    file -- confirm against the base class).
    """

    # strptime format of the date text that ends a post's <label> element.
    _TIME_FORMAT = "\n%y/%m/%d(%a)%H:%M"

    _logger = logging.getLogger(__name__)

    def threads_url(self, board):
        """Return the URL of the first index page of *board*."""
        return "%s%s/" % (self._base_url, board)

    def posts_url(self, board, thread):
        """Return the URL of the HTML page of thread number *thread* on *board*."""
        return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)

    @staticmethod
    def item_id(item):
        """Return the numeric id stored in *item*."""
        return item["id"]

    def item_urls(self, item, board):
        """Return the de-duplicated links found in the item's HTML.

        *board* is unused here; it is kept so the signature stays unchanged
        for callers.
        """
        return list(set(get_links_from_html_body(item["html"], self._base_url)))

    @staticmethod
    def item_type(item):
        """Return the item's type string (``"post"`` or ``"thread"``)."""
        return item["type"]

    @staticmethod
    def thread_mtime(thread):
        """Return a placeholder modification time for *thread* (always -1)."""
        return -1  # TODO: Parse the 'X posts, Y images' span

    @staticmethod
    def item_mtime(item):
        """Return the integer timestamp stored in *item*."""
        return item["time"]

    @staticmethod
    def _is_thread_div_id(tid):
        """Return True when *tid* is one leading character followed by digits."""
        return bool(tid) and tid[1:].isdigit()

    @staticmethod
    def _label_timestamp(label):
        """Parse the date text that is the last child of *label*.

        Raises ValueError when the label has no children or its last child
        does not match ``_TIME_FORMAT``.
        """
        *_, stamp = label.children
        parsed = datetime.datetime.strptime(stamp, DesuChanHtmlChanHelper._TIME_FORMAT)
        # NOTE(review): ``parsed`` is naive, so timestamp() interprets it in
        # the local timezone of the machine running the scraper -- confirm
        # that this matches the timezone the site renders dates in.
        return int(parsed.timestamp())

    def parse_threads_list(self, r):
        """Extract the threads listed on a board index page.

        :param r: response object exposing the page HTML as ``r.text``.
        :return: ``(threads, next_url)`` where *threads* is a list of
            ``{"id": int}`` dicts and *next_url* is the URL of the next
            index page, or ``None`` when there is none.
        """
        soup = BeautifulSoup(r.text, "html.parser")

        threads = [
            {"id": int(thread_el.get("id")[1:])}
            for thread_el in soup.find_all("div", id=self._is_thread_div_id)
        ]

        for form in soup.find_all("form"):
            next_button = form.find("input", attrs={"value": "Next"})
            action = form.get("action")
            # A "Next" form without an action, or whose action is the
            # literal "none", does not lead to another page.
            if next_button and action and action != "none":
                return threads, self._base_url.rstrip("/") + action
        return threads, None

    @staticmethod
    def parse_thread(r):
        """Yield every reply of a thread page, then the opening post.

        Replies are the ``<table>`` elements that are direct children of the
        thread container. Each one is removed from the tree after it has
        been yielded, so the ``html`` of the final ``"thread"`` item no
        longer contains the replies already emitted.

        Tables lacking a ``<label>`` or a ``td.reply`` cell are logged and
        skipped (they stay in the thread HTML). Nothing is yielded when the
        page has no thread container or the opening post has no label.

        :param r: response object exposing the page HTML as ``r.text``.
        :raises ValueError: when a post's date text cannot be parsed.
        """
        helper = DesuChanHtmlChanHelper
        soup = BeautifulSoup(r.text, "html.parser")

        op_el = soup.find("div", id=helper._is_thread_div_id)
        if op_el is None:
            helper._logger.warning("No thread container found in page")
            return

        for post_el in op_el.find_all("table", recursive=False):
            label = post_el.find("label")
            # attrs must be a mapping; the set literal {"class", "reply"}
            # used previously was not the intended class filter.
            reply_cell = post_el.find("td", attrs={"class": "reply"})
            if label is None or reply_cell is None:
                helper._logger.warning(
                    "Skipping table without reply markup in thread %s",
                    op_el.get("id"),
                )
                continue

            yield {
                # The cell id is a 5-character prefix followed by the number.
                "id": int(reply_cell.get("id")[5:]),
                "type": "post",
                "html": str(post_el),
                "time": helper._label_timestamp(label),
            }
            post_el.decompose()

        op_label = op_el.find("label")
        if op_label is None:
            helper._logger.warning(
                "Thread %s has no label; opening post skipped", op_el.get("id")
            )
            return

        yield {
            "id": int(op_el.get("id")[1:]),
            "type": "thread",
            "html": str(op_el),
            "time": helper._label_timestamp(op_label),
        }