From 77a053d6ee294fceafe34f8304d884b3ebfff7c6 Mon Sep 17 00:00:00 2001 From: simon987 Date: Wed, 25 Dec 2019 17:21:37 -0500 Subject: [PATCH] Add two Lynx chans, update dependencies --- README.md | 2 +- chan/alokal_json.py | 2 +- chan/chan.py | 26 ++++++++++- chan/{json.py => chan_json.py} | 0 chan/json_kun.py | 2 +- chan/lynx.py | 83 ++++++++++++++++++++++++++++++++++ chan/synch_json.py | 2 +- requirements.txt | 4 +- run.py | 10 ++-- 9 files changed, 119 insertions(+), 12 deletions(-) rename chan/{json.py => chan_json.py} (100%) create mode 100644 chan/lynx.py diff --git a/README.md b/README.md index a79a4d4..784982e 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ Compatible image boards: 4chan, lainchan, uboachan, 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal, horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon, -chan.org.li, hispachan, 8kun, nowere, iichan and more. +chan.org.li, hispachan, 8kun, nowere, iichan, 2chan and more. Can optionally push monitoring data to InfluxDB. Below is an example of Grafana being used to display it. 
class LynxChanHelper(ChanHelper):
    """Chan helper for boards that expose the LynxChan JSON API.

    See https://gitgud.io/LynxChan/LynxChan/blob/master/doc/Json.txt
    """

    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
        """Forward all arguments to ``ChanHelper`` and install a cloudscraper GET.

        The arguments are passed to the base class unchanged; their meaning is
        defined there.
        """
        super().__init__(db_id, base_url, image_url, thread_path, image_path, boards, rps)

        # NOTE(review): presumably the crawler calls ``get_method`` to fetch
        # pages -- confirm against ChanHelper / the caller.
        scraper = cloudscraper.create_scraper()
        self.get_method = scraper.get

    @staticmethod
    def item_id(item):
        """Return ``threadId`` for a thread item and ``postId`` for a reply."""
        if LynxChanHelper.item_type(item) == "thread":
            return item["threadId"]
        return item["postId"]

    @staticmethod
    def item_mtime(item):
        """Return the item's ``creation`` field as a POSIX timestamp.

        ``creation`` is parsed as an ISO 8601 string. Before Python 3.11
        ``datetime.fromisoformat`` rejects the ``Z`` (UTC) suffix, so it is
        rewritten to the equivalent ``+00:00`` offset first.

        Raises:
            KeyError: if the item has no ``creation`` field.
            ValueError: if the value is not a parseable ISO 8601 string.
        """
        created = item["creation"]
        if created.endswith("Z"):
            created = created[:-1] + "+00:00"
        return datetime.fromisoformat(created).timestamp()

    def item_urls(self, item, board):
        """Return absolute URLs for every entry of the item's ``files`` list.

        Each file's ``path`` is resolved against ``self._base_url``. A missing,
        null or empty ``files`` field yields an empty list. ``board`` is unused.
        """
        files = item.get("files") or []
        return [urljoin(self._base_url, im["path"]) for im in files]

    @staticmethod
    def item_type(item):
        """Return ``"thread"`` if the item has a ``threadId`` key, else ``"post"``."""
        return "thread" if "threadId" in item else "post"

    def threads_url(self, board):
        """Return the URL of the first JSON index page (``1.json``) of ``board``."""
        return "%s%s/1.json" % (self._base_url, board)

    @staticmethod
    def thread_mtime(thread):
        """Return a change marker for a thread: omitted posts + listed posts.

        This is a post count, not a timestamp. The ``ommitedPosts`` key is
        read with exactly this spelling; a missing or null value counts as 0,
        and a missing or null ``posts`` list counts as empty.
        """
        omitted = thread.get("ommitedPosts") or 0
        return omitted + len(thread.get("posts") or ())

    @staticmethod
    def parse_threads_list(r):
        """Parse one board index page.

        Args:
            r: HTTP response; ``r.content``, ``r.text`` and ``r.url`` are read.

        Returns:
            ``(threads, next_page)``. ``threads`` is the payload's ``threads``
            list, or ``[]`` when the body is not JSON, is not an object, or
            has no ``threads`` key. ``next_page`` is the URL of the following
            ``<n>.json`` page, or ``None`` when the current page number
            cannot be read from the URL or is not below ``pageCount``.
        """
        try:
            j = json.loads(r.content.decode('utf-8', 'ignore'))
        except JSONDecodeError:
            logger.warning("JSONDecodeError for %s:" % (r.url,))
            logger.warning(r.text)
            return [], None

        # A non-object payload cannot carry "threads"/"pageCount"; treat it
        # like an empty page instead of failing on the lookups below.
        if not isinstance(j, dict) or "threads" not in j:
            logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
            return [], None

        next_page = None
        # Drop the query string, then read "<n>" out of the trailing "<n>.json".
        url = r.url.split("?", 1)[0]
        try:
            current_page = int(url[url.rfind("/") + 1:-5])
        except ValueError:
            logger.warning("Could not determine page number for %s" % (r.url,))
            current_page = None

        if current_page is not None and current_page < (j.get("pageCount") or 0):
            next_page = urljoin(r.url, "%d.json" % (current_page + 1))

        return j["threads"], next_page

    @staticmethod
    def parse_thread(r):
        """Parse one thread page into a flat list of items.

        Args:
            r: HTTP response; ``r.content``, ``r.text`` and ``r.url`` are read.

        Returns:
            The replies in order, each tagged with ``_parent`` set to the
            thread's ``threadId``, followed by the thread object itself with
            its ``posts`` list removed. Returns ``[]`` when the body is not
            JSON or is not an object carrying ``threadId``.
        """
        try:
            j = json.loads(r.content.decode('utf-8', 'ignore'))
        except JSONDecodeError:
            logger.warning("JSONDecodeError for %s:" % (r.url,))
            logger.warning(r.text)
            return []

        if not isinstance(j, dict) or "threadId" not in j:
            logger.warning("No thread in response for %s: %s" % (r.url, r.text,))
            return []

        thread_id = j["threadId"]
        # Detach the replies so the thread object is emitted without them.
        all_items = j.pop("posts", None) or []
        for post in all_items:
            post["_parent"] = thread_id

        all_items.append(j)
        return all_items
AND chan=?", @@ -122,7 +122,7 @@ class ChanState: if mtime == -1: return True - with sqlite3.connect(self._db, timeout=5000) as conn: + with sqlite3.connect(self._db, timeout=10000) as conn: cur = conn.cursor() cur.execute( "SELECT last_modified, ts FROM threads WHERE thread=? AND chan=?", @@ -134,7 +134,7 @@ class ChanState: return False def mark_thread_as_visited(self, thread, helper, board): - with sqlite3.connect(self._db, timeout=5000) as conn: + with sqlite3.connect(self._db, timeout=10000) as conn: conn.execute( "INSERT INTO threads (thread, last_modified, chan) " "VALUES (?,?,?) " @@ -243,7 +243,7 @@ if __name__ == "__main__": state = ChanState() publish_q = Queue() - for _ in range(5): + for _ in range(10): publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy)) publish_thread.setDaemon(True) publish_thread.start()