Add two Lynx chans, update dependencies

This commit is contained in:
simon987 2019-12-25 17:21:37 -05:00
parent 7ea1612b32
commit 77a053d6ee
9 changed files with 119 additions and 12 deletions

View File

@ -7,7 +7,7 @@ image boards and publishes serialised JSON to RabbitMQ
Compatible image boards: 4chan, lainchan, uboachan,
22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
horochan, doushio, desuchan, tgchan, lolnada, 7chan, chanon,
chan.org.li, hispachan, 8kun, nowere, iichan and more.
chan.org.li, hispachan, 8kun, nowere, iichan, 2chan and more.
Can optionally push monitoring data to InfluxDB. Below is an
example of Grafana being used to display it.

View File

@ -1,4 +1,4 @@
from chan.json import JsonChanHelper
from chan.chan_json import JsonChanHelper
from post_process import get_links_from_body

View File

@ -9,10 +9,11 @@ from chan.endchan_html import EndchanHtmlChanHelper
from chan.fchan_html import FChanHtmlChanHelper
from chan.hispachan_html import HispachanHtmlHelper
from chan.iichan_html import IichanHtmlChanHelper
from chan.json import JsonChanHelper
from chan.chan_json import JsonChanHelper
from chan.json_kun import JsonKunChanHelper
from chan.kev4_php import Kev4PhpHelper
from chan.lolnada_html import LolNadaHtmlChanHelper
from chan.lynx import LynxChanHelper
from chan.mayuri import MayuriChanHelper
from chan.nowere_html import NowereHtmlChanHelper
from chan.plus4chan_html import Plus4ChanHelper
@ -596,5 +597,26 @@ CHANS = {
),
rps=1 / 3
),
# next is 36
"waifuist": LynxChanHelper(
36,
"https://waifuist.pro/",
"https://waifuist.pro/",
"/res/",
"",
(
"w", "starlet", "etc",
),
rps=1 / 25
),
"cutiegarden": LynxChanHelper(
37,
"https://cutie.garden/",
"https://cutie.garden/",
"/res/",
"",
(
"lg", "cozy", "meta", "test"
),
rps=1 / 25
),
}

View File

@ -1,6 +1,6 @@
from vanwanet_scrape.scraper import Scraper
from chan.json import JsonChanHelper
from chan.chan_json import JsonChanHelper
from util import logger

83
chan/lynx.py Normal file
View File

@ -0,0 +1,83 @@
import json
from datetime import datetime
from json import JSONDecodeError
from urllib.parse import urljoin
import cloudscraper
from chan.helper import ChanHelper
from util import logger
class LynxChanHelper(ChanHelper):
    """Scraper helper for LynxChan-based image boards.

    See https://gitgud.io/LynxChan/LynxChan/blob/master/doc/Json.txt
    for the JSON API this parses.
    """

    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
        super().__init__(db_id, base_url, image_url, thread_path, image_path, boards, rps)
        # LynxChan sites are commonly fronted by Cloudflare; route all GETs
        # through a cloudscraper session so challenges are solved transparently.
        scraper = cloudscraper.create_scraper()
        self.get_method = scraper.get

    @staticmethod
    def item_id(item):
        """Return the board-local numeric id of a thread or reply."""
        return item["threadId"] if LynxChanHelper.item_type(item) == "thread" else item["postId"]

    @staticmethod
    def item_mtime(item):
        """Return the item's creation time as a UNIX timestamp.

        LynxChan serialises timestamps as ISO-8601 with a trailing "Z"
        (e.g. "2019-12-25T17:21:37.000Z"). datetime.fromisoformat() rejects
        the "Z" suffix on Python < 3.11, so normalise it to an explicit UTC
        offset; this also makes the resulting timestamp timezone-correct
        instead of depending on the local machine's timezone.
        """
        return datetime.fromisoformat(item["creation"].replace("Z", "+00:00")).timestamp()

    def item_urls(self, item, board):
        """Return absolute URLs of all files attached to an item (possibly empty)."""
        return [
            urljoin(self._base_url, im["path"])
            for im in item["files"]
        ] if "files" in item and item["files"] else []

    @staticmethod
    def item_type(item):
        """Classify an item: objects carrying "threadId" are threads, else posts."""
        return "thread" if "threadId" in item else "post"

    def threads_url(self, board):
        """Return the URL of the first catalog page for *board*."""
        return "%s%s/1.json" % (self._base_url, board)

    @staticmethod
    def thread_mtime(thread):
        """Return a monotonically increasing activity counter for a thread.

        NOTE: "ommitedPosts" is LynxChan's own (misspelled) API key —
        do not "correct" the spelling.
        """
        return thread.get("ommitedPosts", 0) + len(thread["posts"])

    @staticmethod
    def parse_threads_list(r):
        """Parse a catalog page response.

        Returns a (threads, next_page_url) tuple; next_page_url is None on
        the last page or on any parse error.
        """
        try:
            j = json.loads(r.content.decode('utf-8', 'ignore'))
            if len(j) == 0 or "threads" not in j:
                logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
                return [], None
        except JSONDecodeError:
            logger.warning("JSONDecodeError for %s:" % (r.url,))
            logger.warning(r.text)
            return [], None

        next_page = None
        # The current page number is embedded in the URL as ".../<n>.json";
        # strip any query string, then slice between the last "/" and ".json".
        url = r.url[:r.url.rfind("?")] if "?" in r.url else r.url
        current_page = int(url[url.rfind("/") + 1:-5])
        if current_page < j["pageCount"]:
            next_page = urljoin(r.url, "%d.json" % (current_page + 1))

        return j["threads"], next_page

    @staticmethod
    def parse_thread(r):
        """Parse a thread JSON response into a flat list of items.

        Each reply is tagged with its parent thread id under the synthetic
        "_parent" key; the thread object itself (with "posts" stripped to
        avoid duplicating the replies) is appended last. Returns [] on a
        malformed response.
        """
        try:
            j = json.loads(r.content.decode('utf-8', 'ignore'))
        except JSONDecodeError:
            logger.warning("JSONDecodeError for %s:" % (r.url,))
            logger.warning(r.text)
            return []

        all_items = []
        # Tolerate a thread object with no "posts" array (empty thread).
        for post in j.get("posts", []):
            post["_parent"] = j["threadId"]
            all_items.append(post)
        j.pop("posts", None)
        all_items.append(j)
        return all_items

View File

@ -1,4 +1,4 @@
from chan.json import JsonChanHelper
from chan.chan_json import JsonChanHelper
from post_process import get_links_from_body

View File

@ -7,4 +7,6 @@ influxdb
pika
bs4
urllib3
git+git://github.com/simon987/hexlib.git
git+git://github.com/simon987/hexlib.git
git+git://github.com/simon987/vanwanet_scrape.git
cloudscraper

10
run.py
View File

@ -102,14 +102,14 @@ class ChanState:
conn.commit()
def mark_visited(self, item: int, helper):
with sqlite3.connect(self._db) as conn:
with sqlite3.connect(self._db, timeout=10000) as conn:
conn.execute(
"INSERT INTO posts (post, chan) VALUES (?,?)",
(item, helper.db_id)
)
def has_visited(self, item: int, helper):
with sqlite3.connect(self._db) as conn:
with sqlite3.connect(self._db, timeout=10000) as conn:
cur = conn.cursor()
cur.execute(
"SELECT post FROM posts WHERE post=? AND chan=?",
@ -122,7 +122,7 @@ class ChanState:
if mtime == -1:
return True
with sqlite3.connect(self._db, timeout=5000) as conn:
with sqlite3.connect(self._db, timeout=10000) as conn:
cur = conn.cursor()
cur.execute(
"SELECT last_modified, ts FROM threads WHERE thread=? AND chan=?",
@ -134,7 +134,7 @@ class ChanState:
return False
def mark_thread_as_visited(self, thread, helper, board):
with sqlite3.connect(self._db, timeout=5000) as conn:
with sqlite3.connect(self._db, timeout=10000) as conn:
conn.execute(
"INSERT INTO threads (thread, last_modified, chan) "
"VALUES (?,?,?) "
@ -243,7 +243,7 @@ if __name__ == "__main__":
state = ChanState()
publish_q = Queue()
for _ in range(5):
for _ in range(10):
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper, proxy))
publish_thread.setDaemon(True)
publish_thread.start()