From 60fa4893d8b117e39915edc7d8f429a39c6a0b98 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 8 Sep 2019 17:26:33 -0400 Subject: [PATCH] add lolnada --- README.md | 2 +- chan/chan.py | 25 ++++++++++---- chan/desuchan_html.py | 2 +- chan/doushio_html.py | 2 +- chan/endchan_html.py | 2 +- chan/helper.py | 2 +- chan/lolnada_html.py | 76 +++++++++++++++++++++++++++++++++++++++++++ chan/mayuri.py | 2 +- run.py | 2 +- 9 files changed, 102 insertions(+), 13 deletions(-) create mode 100644 chan/lolnada_html.py diff --git a/README.md b/README.md index 5307c21..7876c33 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ Compatible image boards: 4chan, lainchan, uboachan, 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal, -horochan, doushio, desuchan, tgchan. +horochan, doushio, desuchan, tgchan, lolnada. Can optionally push monitoring data to InfluxDB. Below is an example of Grafana being used to display it. diff --git a/chan/chan.py b/chan/chan.py index ca01d5e..4995821 100644 --- a/chan/chan.py +++ b/chan/chan.py @@ -3,6 +3,7 @@ from chan.desuchan_html import DesuChanHtmlChanHelper from chan.doushio_html import DoushioHtmlChanHelper from chan.endchan_html import EndchanHtmlChanHelper from chan.json import JsonChanHelper +from chan.lolnada_html import LolNadaHtmlChanHelper from chan.mayuri import MayuriChanHelper from chan.russian_json import RussianJsonChanHelper from chan.synch_json import SynchJsonChanHelper @@ -105,7 +106,7 @@ CHANS = { "a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga", "vape", "h", "ho", "hc", "e", "fet", "sex", "fag" ), - rps=1 + rps=1/10 ), "endchan": EndchanHtmlChanHelper( 8, @@ -126,7 +127,7 @@ CHANS = { "ausneets", "qanonresearch", "polru", "yuri", "christianity", "kc", "rapport", "news", "brit", "webm", "4chon" ), - rps=1 + rps=1/2 ), "38chan": JsonChanHelper( 9, @@ -149,7 +150,7 @@ CHANS = { "b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp", "fit", "had", ), - rps=1 / 4 + rps=1 / 30 ), "gnfos": JsonChanHelper( 11, @@ -214,7 +215,7 @@ CHANS = { ( "moe", ), - rps=1 + rps=1/20 ), "desuchan": DesuChanHtmlChanHelper( 17, @@ -253,6 +254,18 @@ CHANS = { ( "draw", "meep", "quest", "questdis", "tg", "icons", ), - rps=1, - ) + rps=1/600, + ), + "lolnada": LolNadaHtmlChanHelper( + 20, + "https://lolnada.org/", + "https://lolnada.org/", + "/hilo/", + "/src/", + ( + "b", "a", "aw", "cgl", "dw", "int", "qt", "sad", "t", + "toy", "v", "x", "34", "e", "f", "h" + ), + rps=1/20, + ), } diff --git a/chan/desuchan_html.py b/chan/desuchan_html.py index c9d2cea..fca50f1 100644 --- a/chan/desuchan_html.py +++ b/chan/desuchan_html.py @@ -12,7 +12,7 @@ class DesuChanHtmlChanHelper(ChanHelper): return "%s%s/" % (self._base_url, board) def posts_url(self, board, thread): - return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread) + return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, self.item_id(thread)) @staticmethod def item_id(item): diff --git a/chan/doushio_html.py b/chan/doushio_html.py index add3de6..a39dc01 100644 --- a/chan/doushio_html.py +++ b/chan/doushio_html.py @@ -13,7 +13,7 @@ class DoushioHtmlChanHelper(ChanHelper): return "%s%s/" % (self._base_url, board) def posts_url(self, board, thread): - return "%s%s/%d" % (self._base_url, board, thread) + return "%s%s/%d" % (self._base_url, board, self.item_id(thread)) @staticmethod def item_id(item): diff --git a/chan/endchan_html.py b/chan/endchan_html.py index 1ba5778..70869e5 100644 --- a/chan/endchan_html.py +++ b/chan/endchan_html.py @@ -13,7 +13,7 @@ class EndchanHtmlChanHelper(ChanHelper): return "%s%s/" % (self._base_url, board) def posts_url(self, board, thread): - return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread) + return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, self.item_id(thread)) @staticmethod def item_id(item): diff --git a/chan/helper.py b/chan/helper.py index 0dc13f2..64546bd 100644 --- a/chan/helper.py +++ b/chan/helper.py @@ -21,7 +21,7 @@ class ChanHelper: return "%s%s/threads.json" % (self._base_url, board) def posts_url(self, board, thread): - return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread) + return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, self.item_id(thread)) def board_hash(self, board): return str((self._boards.index(board) + 1) * 10000) diff --git a/chan/lolnada_html.py b/chan/lolnada_html.py new file mode 100644 index 0000000..5b10795 --- /dev/null +++ b/chan/lolnada_html.py @@ -0,0 +1,76 @@ +from urllib.parse import urljoin + +from bs4 import BeautifulSoup +from dateutil import parser + +from chan.helper import ChanHelper +from post_process import get_links_from_html_body + + +class LolNadaHtmlChanHelper(ChanHelper): + + def threads_url(self, board): + return "%s%s/" % (self._base_url, board) + + def posts_url(self, board, thread): + return "%s%s" % (self._base_url, thread["url"]) + + @staticmethod + def item_id(item): + return item["id"] + + def item_urls(self, item, board): + return [ + x for + x in set(get_links_from_html_body(item["html"], self._base_url)) + if "google.com" not in x and "iqdb.org" not in x + ] + + @staticmethod + def item_type(item): + return item["type"] + + @staticmethod + def thread_mtime(thread): + return -1 + + @staticmethod + def item_mtime(item): + return item["time"] + + def parse_threads_list(self, r): + soup = BeautifulSoup(r.text, "html.parser") + + threads = [] + + for threadEl in soup.find_all("div", class_="hilo"): + threads.append({ + "id": int(threadEl.get("data-id")), + "url": threadEl.find("a", class_="post_no").get("href"), + }) + + for form in soup.find_all("form"): + next_button = form.find("input", attrs={"value": "Siguiente"}) + if next_button and form.get("action") != "none": + return threads, urljoin(self._base_url, form.get("action")) + return threads, None + + @staticmethod + def parse_thread(r): + soup = BeautifulSoup(r.text, "html.parser") + + op_el = soup.find("div", class_="hilo") + for post_el in op_el.find_all("div", class_="post reply"): + yield { + "id": int(post_el.get("id")[6:]), + "type": "post", + "html": str(post_el), + "time": int(parser.parse(post_el.find("time").get("datetime")).timestamp()) + } + post_el.decompose() + yield { + "id": int(op_el.get("id")[5:]), + "type": "thread", + "html": str(op_el), + "time": int(parser.parse(op_el.find("time").get("datetime")).timestamp()) + } diff --git a/chan/mayuri.py b/chan/mayuri.py index 3a47b00..6c57256 100644 --- a/chan/mayuri.py +++ b/chan/mayuri.py @@ -72,4 +72,4 @@ class MayuriChanHelper(ChanHelper): return "%sboards/1" % (self._base_url,) def posts_url(self, board, thread): - return "%sthreads/%d" % (self._base_url, thread) + return "%sthreads/%d" % (self._base_url, self.item_id(thread)) diff --git a/run.py b/run.py index 52a7a47..398387b 100644 --- a/run.py +++ b/run.py @@ -47,7 +47,7 @@ class ChanScanner: def _posts(self, board): for thread in self._threads(board): if self.state.has_new_posts(thread, self.helper, board): - for post in self._fetch_posts(board, self.helper.item_id(thread)): + for post in self._fetch_posts(board, thread): yield post self.state.mark_thread_as_visited(thread, self.helper, board)