refactor chan.py, update endchan, add doushio

This commit is contained in:
simon 2019-09-08 14:42:15 -04:00
parent 175d0bebaa
commit 18247d4139
15 changed files with 679 additions and 530 deletions

View File

@@ -6,7 +6,7 @@ image boards and publishes serialised JSON to RabbitMQ
 Compatible image boards: 4chan, lainchan, uboachan,
 22chan, wizchan, 1chan, 2ch.hk, endchan, 38chan, alokal,
-horochan.
+horochan, doushio.

 Can optionally push monitoring data to InfluxDB. Below is an
 example of Grafana being used to display it.

0
__init__.py Normal file
View File

527
chan.py
View File

@@ -1,527 +0,0 @@
import json
from json import JSONDecodeError
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from post_process import get_links_from_body
from util import logger
class ChanHelper:
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
self.db_id = db_id
self._base_url = base_url
self._image_url = image_url
self._thread_path = thread_path
self._image_path = image_path
self._boards = boards
self.rps = rps
def boards(self):
return [b for b in self._boards if not b.startswith("_")]
def image_url(self, board, tim, extension):
return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
def threads_url(self, board):
return "%s%s/threads.json" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
def board_hash(self, board):
return str((self._boards.index(board) + 1) * 10000)
@staticmethod
def item_id(item):
raise NotImplementedError
@staticmethod
def item_mtime(item):
raise NotImplementedError
def item_unique_id(self, item, board):
return int(self.board_hash(board) + str(self.item_id(item)))
@staticmethod
def thread_mtime(thread):
raise NotImplementedError
def item_urls(self, item, board):
raise NotImplementedError
@staticmethod
def item_type(item):
raise NotImplementedError
@staticmethod
def parse_threads_list(r):
raise NotImplementedError
@staticmethod
def parse_thread(r):
raise NotImplementedError
class HtmlChanHelper(ChanHelper):
def threads_url(self, board):
return "%s%s/" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
@staticmethod
def item_id(item):
return item["id"]
def item_urls(self, item, board):
return []
@staticmethod
def item_type(item):
return item["type"]
@staticmethod
def thread_mtime(thread):
return -1
@staticmethod
def item_mtime(item):
return 0 # TODO
def parse_threads_list(self, r):
soup = BeautifulSoup(r.text, "html.parser")
threads = []
for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
threads.append({
"id": int(threadEl.get("id")),
})
next_url = soup.find("a", attrs={"id": "linkNext"})
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None
@staticmethod
def parse_thread(r):
soup = BeautifulSoup(r.text, "html.parser")
op_el = soup.find("div", attrs={"class": "innerOP"})
yield {
"id": int(soup.find("div", class_="opCell").get("id")),
"type": "thread",
"html": str(op_el),
}
for post_el in soup.find_all("div", class_="postCell"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
}
class JsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return item["no"]
@staticmethod
def item_mtime(item):
return item["time"]
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self.image_url(board, item["tim"], item["ext"]))
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "sub" in item else "post"
@staticmethod
def thread_mtime(thread):
return thread["last_modified"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
if len(j) == 0 or "threads" not in j[0]:
logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
return [], None
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
threads = []
for page in j:
for thread in page["threads"]:
threads.append(thread)
return threads, None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
return j["posts"]
class RussianJsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return int(item["num"])
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
return j["threads"], None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
for thread in j["threads"]:
for post in thread["posts"]:
yield post
@staticmethod
def thread_mtime(thread):
return thread["posts_count"]
@staticmethod
def item_type(item):
return "thread" if "subject" in item and item["subject"] != "" else "post"
def item_urls(self, item, board):
urls = set()
if "comment" in item and item["comment"]:
urls.update(get_links_from_body(item["comment"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
for file in item["files"]:
urls.add(self._base_url.rstrip("/") + file["path"])
return list(urls)
class AlokalJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
return list(urls)
class SynchJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
return list(urls)
class MayuriChanHelper(ChanHelper):
def __init__(self, db_id, base_url, image_url, boards, rps):
super().__init__(db_id, base_url, image_url, None, None, boards, rps)
@staticmethod
def item_id(item):
return item["id"]
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def thread_mtime(thread):
return thread["replies_count"]
def item_urls(self, item, board):
urls = set()
if "message" in item and item["message"]:
urls.update(get_links_from_body(item["message"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
if item["files"]:
for file in item["files"]:
urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "replies_count" in item else "post"
def parse_threads_list(self, r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
if j["currentPage"] < j["totalPages"]:
return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1, )
return j["data"], None
@staticmethod
def parse_thread(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return []
thread = dict(j["data"])
del thread["replies"]
yield thread
if j["data"]["replies"]:
for post in j["data"]["replies"]:
yield post
def threads_url(self, board):
return "%sboards/1" % (self._base_url, )
def posts_url(self, board, thread):
return "%sthreads/%d" % (self._base_url, thread)
CHANS = {
"4chan": JsonChanHelper(
1,
"https://a.4cdn.org/",
"https://i.4cdn.org/",
"/thread/",
"/",
(
"a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
"k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
"vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
"cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
"bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
"gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
"news", "out", "po", "pol", "qst", "sci", "soc", "sp",
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
),
rps=2
),
"lainchan": JsonChanHelper(
2,
"https://lainchan.org/",
"https://lainchan.org/",
"/res/",
"/src/",
(
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
"hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
"_mega",
),
rps=1 / 60
),
"uboachan": JsonChanHelper(
3,
"https://uboachan.net/",
"https://uboachan.net/",
"/res/",
"/src/",
(
"yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
),
rps=1 / 120
),
"22chan": JsonChanHelper(
4,
"https://22chan.org/",
"https://22chan.org/",
"/res/",
"/src/",
(
"a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
"sg", "t", "vg"
),
rps=1 / 120
),
"wizchan": JsonChanHelper(
5,
"https://wizchan.org/",
"https://wizchan.org/",
"/res/",
"/src/",
(
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
),
rps=1 / 30
),
# TODO
"1chan": ChanHelper(
6,
"https://www.1chan.net/",
"https://www.1chan.net/",
"/res/",
"/src/",
(
"rails"
),
rps=1 / 600
),
"2chhk": RussianJsonChanHelper(
7,
"https://2ch.hk/",
"https://2ch.hk/",
"/res/",
"/src/",
(
"d", "b", "o", "soc", "media", "r", "api", "rf", "int",
"po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
"fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
"mo", "mov", "mu", "ne", "psy", "re",
"sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
"wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
"wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
"web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
),
rps=1
),
"endchan": HtmlChanHelper(
8,
"https://endchan.net/",
"https://endchan.net/",
"/res/",
"/.media/",
(
"art", "film", "oekaki", "draw",
"adv", "r9k", "hope", "spoon",
"a", "am", "amr", "l", "monster", "m", "2hu", "animach",
"b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu",
"metal", "tv", "f", "clipuploads",
"4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int",
"intl", "lang", "librejp", "rzabczan", "55chan",
"pol", "pdfs", "his", "ggrevols", "horror", "aethism",
"tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600",
"ausneets", "qanonresearch", "polru", "yuri", "christianity",
"kc", "rapport", "news", "brit", "webm", "4chon"
),
rps=1
),
"38chan": JsonChanHelper(
9,
"http://38chan.net/",
"http://38chan.net/",
"/res/",
"/src/",
(
"a", "b", "g", "38"
),
rps=1 / 600
),
"alokal": AlokalJsonChanHelper(
10,
"https://alokal.eu/",
"https://alokal.eu/",
"/",
"src/",
(
"b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
"fit", "had",
),
rps=1 / 4
),
"gnfos": JsonChanHelper(
11,
"https://gnfos.com/",
"https://gnfos.com/",
"/res/",
"/src/",
(
"jp", "drive"
),
rps=1 / 60
),
"synch": SynchJsonChanHelper(
12,
"https://syn-ch.ru/",
"https://cdn.syn-ch.ru/",
"/res/",
"src",
(
"b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
"_wh", "old", "test"
),
rps=1 / 120
),
"tahta": JsonChanHelper(
13,
"https://tahta.ch/",
"https://tahta.ch/",
"/res/",
"/src/",
(
"b", "g", "s", "v"
),
rps=1 / 300
),
"awsumchan": JsonChanHelper(
14,
"https://awsumchan.org/",
"https://awsumchan.org/",
"/res/",
"/src/",
(
"an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
),
rps=1 / 600
),
"horochan": MayuriChanHelper(
15,
"https://api.horochan.ru/v1/",
"https://%s.horochan.ru/src/",
(
"b"
),
rps=1/20
),
}

0
chan/__init__.py Normal file
View File

17
chan/alokal_json.py Normal file
View File

@@ -0,0 +1,17 @@
from chan.json import JsonChanHelper
from post_process import get_links_from_body
class AlokalJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + "/" + str(item["no"]) + item["ext"])
return list(urls)

217
chan/chan.py Normal file
View File

@@ -0,0 +1,217 @@
from chan.alokal_json import AlokalJsonChanHelper
from chan.doushio_html import DoushioHtmlChanHelper
from chan.endchan_html import EndchanHtmlChanHelper
from chan.json import JsonChanHelper
from chan.mayuri import MayuriChanHelper
from chan.russian_json import RussianJsonChanHelper
from chan.synch_json import SynchJsonChanHelper
CHANS = {
"4chan": JsonChanHelper(
1,
"https://a.4cdn.org/",
"https://i.4cdn.org/",
"/thread/",
"/",
(
"a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
"k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
"vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
"cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
"bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
"gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
"news", "out", "po", "pol", "qst", "sci", "soc", "sp",
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
),
rps=2
),
"lainchan": JsonChanHelper(
2,
"https://lainchan.org/",
"https://lainchan.org/",
"/res/",
"/src/",
(
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
"hum", "drg", "zzz", "layer", "q", "r", "_cult", "_psy",
"_mega",
),
rps=1 / 60
),
"uboachan": JsonChanHelper(
3,
"https://uboachan.net/",
"https://uboachan.net/",
"/res/",
"/src/",
(
"yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
),
rps=1 / 120
),
"22chan": JsonChanHelper(
4,
"https://22chan.org/",
"https://22chan.org/",
"/res/",
"/src/",
(
"a", "b", "f", "yu", "i", "k", "mu", "pol", "sewers",
"sg", "t", "vg"
),
rps=1 / 120
),
"wizchan": JsonChanHelper(
5,
"https://wizchan.org/",
"https://wizchan.org/",
"/res/",
"/src/",
(
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
),
rps=1 / 30
),
# TODO
# "1chan": ChanHelper(
# 6,
# "https://www.1chan.net/",
# "https://www.1chan.net/",
# "/res/",
# "/src/",
# (
# "rails"
# ),
# rps=1 / 600
# ),
"2chhk": RussianJsonChanHelper(
7,
"https://2ch.hk/",
"https://2ch.hk/",
"/res/",
"/src/",
(
"d", "b", "o", "soc", "media", "r", "api", "rf", "int",
"po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
"fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
"mo", "mov", "mu", "ne", "psy", "re",
"sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
"wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
"wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
"web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
),
rps=1
),
"endchan": EndchanHtmlChanHelper(
8,
"https://endchan.net/",
"https://endchan.net/",
"/res/",
"/.media/",
(
"art", "film", "oekaki", "draw",
"adv", "r9k", "hope", "spoon",
"a", "am", "amr", "l", "monster", "m", "2hu", "animach",
"b", "webm", "v", "vvv", "vidya", "tg", "otomad", "mu",
"metal", "tv", "f", "clipuploads",
"4", "deutsch", "j", "jp" "italia", "fr", "kc", "kurenai", "int",
"intl", "lang", "librejp", "rzabczan", "55chan",
"pol", "pdfs", "his", "ggrevols", "horror", "aethism",
"tech", "g", "markov", "os", "agdg", "cyber", "HTML", "2600",
"ausneets", "qanonresearch", "polru", "yuri", "christianity",
"kc", "rapport", "news", "brit", "webm", "4chon"
),
rps=1
),
"38chan": JsonChanHelper(
9,
"http://38chan.net/",
"http://38chan.net/",
"/res/",
"/src/",
(
"a", "b", "g", "38"
),
rps=1 / 600
),
"alokal": AlokalJsonChanHelper(
10,
"https://alokal.eu/",
"https://alokal.eu/",
"/",
"src/",
(
"b", "pol", "sk", "int", "slav", "s", "gv", "mda", "sp",
"fit", "had",
),
rps=1 / 4
),
"gnfos": JsonChanHelper(
11,
"https://gnfos.com/",
"https://gnfos.com/",
"/res/",
"/src/",
(
"jp", "drive"
),
rps=1 / 60
),
"synch": SynchJsonChanHelper(
12,
"https://syn-ch.ru/",
"https://cdn.syn-ch.ru/",
"/res/",
"src",
(
"b", "d", "_r", "a", "_g", "mlp", "mu", "_tv", "vg",
"_wh", "old", "test"
),
rps=1 / 120
),
"tahta": JsonChanHelper(
13,
"https://tahta.ch/",
"https://tahta.ch/",
"/res/",
"/src/",
(
"b", "g", "s", "v"
),
rps=1 / 300
),
"awsumchan": JsonChanHelper(
14,
"https://awsumchan.org/",
"https://awsumchan.org/",
"/res/",
"/src/",
(
"an", "aw", "cr", "fi", "ra", "au", "ga", "he", "sp"
),
rps=1 / 600
),
"horochan": MayuriChanHelper(
15,
"https://api.horochan.ru/v1/",
"https://%s.horochan.ru/src/",
(
"b",
),
rps=1 / 20
),
"doushio": DoushioHtmlChanHelper(
16,
"http://doushio.com/",
"http://doushio.com/",
"",
"/ass/",
(
"moe",
),
rps=1
)
}
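Each entry wires a helper subclass to one site's URL scheme and rate limit. A minimal sketch of how the registry is consumed (hypothetical snippet, assuming the chan package is importable):

from chan.chan import CHANS

helper = CHANS["4chan"]
print(helper.threads_url("g"))       # https://a.4cdn.org/g/threads.json
print(helper.posts_url("g", 12345))  # https://a.4cdn.org/g/thread/12345.json
print(helper.boards())               # boards prefixed with "_" are excluded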

75
chan/doushio_html.py Normal file
View File

@@ -0,0 +1,75 @@
from urllib.parse import urljoin
from dateutil import parser
from bs4 import BeautifulSoup
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
class DoushioHtmlChanHelper(ChanHelper):
def threads_url(self, board):
return "%s%s/" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s/%d" % (self._base_url, board, thread)
@staticmethod
def item_id(item):
return item["id"]
def item_urls(self, item, board):
return list(set(get_links_from_html_body(item["html"], self._base_url)))
@staticmethod
def item_type(item):
return item["type"]
@staticmethod
def thread_mtime(thread):
return -1
@staticmethod
def item_mtime(item):
return item["time"]
def parse_threads_list(self, r):
soup = BeautifulSoup(r.text, "html.parser")
threads = []
for threadEl in soup.find_all("section"):
threads.append({
"id": int(threadEl.get("id")),
})
next_url = soup.find("link", attrs={"rel": "next"})
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None
@staticmethod
def parse_thread(r):
soup = BeautifulSoup(r.text, "html.parser")
op_el = soup.find("section")
for post_el in op_el.find_all("article"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
"time": int(parser.parse(post_el.find("header").find("time").get("datetime")).timestamp())
}
post_el.decompose()
yield {
"id": int(op_el.get("id")),
"type": "thread",
"html": str(op_el),
"time": int(parser.parse(op_el.find("header").find("time").get("datetime")).timestamp())
}
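parse_thread yields the replies first and the stripped-down OP last, since each article is decomposed out of the section as it is consumed. A rough illustration (the HTML below is a made-up fragment shaped to match the selectors above, not captured doushio markup):

import types
from chan.doushio_html import DoushioHtmlChanHelper

html = """
<section id="1000">
  <header><time datetime="2019-09-08T14:00:00Z">14:00</time></header>
  OP text
  <article id="1001">
    <header><time datetime="2019-09-08T14:05:00Z">14:05</time></header>
    first reply
  </article>
</section>
"""
r = types.SimpleNamespace(text=html)  # stand-in for a requests.Response
for item in DoushioHtmlChanHelper.parse_thread(r):
    print(item["type"], item["id"])
# post 1001   (articles are yielded, then decomposed)
# thread 1000 (what remains of the <section> is the OP)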

74
chan/endchan_html.py Normal file
View File

@@ -0,0 +1,74 @@
import datetime
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from chan.helper import ChanHelper
from post_process import get_links_from_html_body
class EndchanHtmlChanHelper(ChanHelper):
def threads_url(self, board):
return "%s%s/" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
@staticmethod
def item_id(item):
return item["id"]
def item_urls(self, item, board):
return list(set(get_links_from_html_body(item["html"], self._base_url)))
@staticmethod
def item_type(item):
return item["type"]
@staticmethod
def thread_mtime(thread):
return -1
@staticmethod
def item_mtime(item):
return item["time"]
def parse_threads_list(self, r):
soup = BeautifulSoup(r.text, "html.parser")
threads = []
for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
threads.append({
"id": int(threadEl.get("id")),
})
next_url = soup.find("a", attrs={"id": "linkNext"})
if next_url:
return threads, urljoin(r.url, next_url.get("href"))
return threads, None
@staticmethod
def parse_thread(r):
soup = BeautifulSoup(r.text, "html.parser")
op_el = soup.find("div", attrs={"class": "innerOP"})
if not op_el:
return []
yield {
"id": int(soup.find("div", class_="opCell").get("id")),
"type": "thread",
"html": str(op_el),
"time": int(datetime.datetime.strptime(op_el.find("span", class_="labelCreated").text,
"%m/%d/%Y (%a) %H:%M:%S").timestamp())
}
for post_el in soup.find_all("div", class_="postCell"):
yield {
"id": int(post_el.get("id")),
"type": "post",
"html": str(post_el),
"time": int(datetime.datetime.strptime(post_el.find("span", class_="labelCreated").text,
"%m/%d/%Y (%a) %H:%M:%S").timestamp())
}
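Unlike doushio, endchan's timestamps come from the human-readable labelCreated span rather than a datetime attribute; a quick check of the format string (the label value is a made-up example):

import datetime

label = "09/08/2019 (Sun) 14:42:15"  # example labelCreated text
print(int(datetime.datetime.strptime(label, "%m/%d/%Y (%a) %H:%M:%S").timestamp()))

Note that strptime yields a naive datetime, so the resulting epoch value is interpreted in the scraper's local timezone.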

75
chan/helper.py Normal file
View File

@@ -0,0 +1,75 @@
class ChanHelper:
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards, rps):
self.db_id = db_id
self._base_url = base_url
self._image_url = image_url
self._thread_path = thread_path
self._image_path = image_path
self._boards = boards
self.rps = rps
def boards(self):
return [b for b in self._boards if not b.startswith("_")]
def image_url(self, board, tim, extension):
return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
def threads_url(self, board):
return "%s%s/threads.json" % (self._base_url, board)
def posts_url(self, board, thread):
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
def board_hash(self, board):
return str((self._boards.index(board) + 1) * 10000)
@staticmethod
def item_id(item):
raise NotImplementedError
@staticmethod
def item_mtime(item):
raise NotImplementedError
def item_unique_id(self, item, board):
return int(self.board_hash(board) + str(self.item_id(item)))
@staticmethod
def thread_mtime(thread):
raise NotImplementedError
def item_urls(self, item, board):
raise NotImplementedError
@staticmethod
def item_type(item):
raise NotImplementedError
@staticmethod
def parse_threads_list(r):
raise NotImplementedError
@staticmethod
def parse_thread(r):
raise NotImplementedError
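A concrete helper is consumed by the crawler only through this interface; a minimal sketch of that loop (hypothetical, assuming web.get returns a requests-style response):

def crawl_board(helper, board, web):
    # parse_threads_list returns (threads, next_page_url_or_None),
    # so this pages through the board index until it runs out.
    url = helper.threads_url(board)
    while url:
        threads, url = helper.parse_threads_list(web.get(url))
        for thread in threads:
            r = web.get(helper.posts_url(board, helper.item_id(thread)))
            for item in helper.parse_thread(r):
                # item_unique_id prefixes the item id with board_hash(),
                # e.g. the second board ("20000") and post 123 -> 20000123
                yield helper.item_unique_id(item, board), item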

60
chan/json.py Normal file
View File

@@ -0,0 +1,60 @@
import json
from json import JSONDecodeError
from chan.helper import ChanHelper
from post_process import get_links_from_body
from util import logger
class JsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return item["no"]
@staticmethod
def item_mtime(item):
return item["time"]
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self.image_url(board, item["tim"], item["ext"]))
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "sub" in item else "post"
@staticmethod
def thread_mtime(thread):
return thread["last_modified"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
if len(j) == 0 or "threads" not in j[0]:
logger.warning("No threads in response for %s: %s" % (r.url, r.text,))
return [], None
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
threads = []
for page in j:
for thread in page["threads"]:
threads.append(thread)
return threads, None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
return j["posts"]
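The threads.json shape this expects is a list of pages, each carrying its own threads array, which parse_threads_list flattens. A trimmed, made-up example (assuming the package is on the path):

import json, types
from chan.json import JsonChanHelper

sample = json.dumps([
    {"page": 1, "threads": [{"no": 12345, "last_modified": 1567965735}]},
    {"page": 2, "threads": [{"no": 12360, "last_modified": 1567965600}]},
])
r = types.SimpleNamespace(text=sample, url="https://a.4cdn.org/g/threads.json")
threads, next_url = JsonChanHelper.parse_threads_list(r)
# threads holds both thread dicts; next_url is always None for this helper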

75
chan/mayuri.py Normal file
View File

@@ -0,0 +1,75 @@
import json
from json import JSONDecodeError
from chan.helper import ChanHelper
from post_process import get_links_from_body
from util import logger
class MayuriChanHelper(ChanHelper):
def __init__(self, db_id, base_url, image_url, boards, rps):
super().__init__(db_id, base_url, image_url, None, None, boards, rps)
@staticmethod
def item_id(item):
return item["id"]
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def thread_mtime(thread):
return thread["replies_count"]
def item_urls(self, item, board):
urls = set()
if "message" in item and item["message"]:
urls.update(get_links_from_body(item["message"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
if item["files"]:
for file in item["files"]:
urls.add(self._image_url % file["storage"] + file["name"] + "." + file["ext"])
return list(urls)
@staticmethod
def item_type(item):
return "thread" if "replies_count" in item else "post"
def parse_threads_list(self, r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
if j["currentPage"] < j["totalPages"]:
return j["data"], self._base_url + "boards/%d" % (j["currentPage"] + 1,)
return j["data"], None
@staticmethod
def parse_thread(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return []
thread = dict(j["data"])
del thread["replies"]
yield thread
if j["data"]["replies"]:
for post in j["data"]["replies"]:
yield post
def threads_url(self, board):
return "%sboards/1" % (self._base_url,)
def posts_url(self, board, thread):
return "%sthreads/%d" % (self._base_url, thread)

55
chan/russian_json.py Normal file
View File

@@ -0,0 +1,55 @@
import json
from json import JSONDecodeError
from chan.helper import ChanHelper
from post_process import get_links_from_body
from util import logger
class RussianJsonChanHelper(ChanHelper):
@staticmethod
def item_id(item):
return int(item["num"])
@staticmethod
def item_mtime(item):
return item["timestamp"]
@staticmethod
def parse_threads_list(r):
try:
j = json.loads(r.text)
except JSONDecodeError:
logger.warning("JSONDecodeError for %s:" % (r.url,))
logger.warning(r.text)
return [], None
return j["threads"], None
@staticmethod
def parse_thread(r):
j = json.loads(r.text)
for thread in j["threads"]:
for post in thread["posts"]:
yield post
@staticmethod
def thread_mtime(thread):
return thread["posts_count"]
@staticmethod
def item_type(item):
return "thread" if "subject" in item and item["subject"] != "" else "post"
def item_urls(self, item, board):
urls = set()
if "comment" in item and item["comment"]:
urls.update(get_links_from_body(item["comment"]))
elif "subject" in item and item["subject"]:
urls.update(get_links_from_body(item["subject"]))
for file in item["files"]:
urls.add(self._base_url.rstrip("/") + file["path"])
return list(urls)

18
chan/synch_json.py Normal file
View File

@@ -0,0 +1,18 @@
from chan.json import JsonChanHelper
from post_process import get_links_from_body
class SynchJsonChanHelper(JsonChanHelper):
def item_urls(self, item, board):
urls = set()
if "com" in item and item["com"]:
urls.update(get_links_from_body(item["com"]))
elif "sub" in item and item["sub"]:
urls.update(get_links_from_body(item["sub"]))
if "fsize" in item and item["fsize"]:
urls.add(self._image_url + self._image_path + item["tim"] + item["ext"])
return list(urls)

View File

@@ -3,6 +3,7 @@ import hashlib
 import re
 import zlib
 from io import BytesIO
+from urllib.parse import urljoin

 import imagehash
 from PIL import Image
@@ -10,6 +11,7 @@ from PIL import Image
 from util import logger

 LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
+HTML_HREF_RE = re.compile(r"href=\"([^\"]+)\"")

 IMAGE_FILETYPES = (
     # :orig for twitter cdn
@@ -71,7 +73,7 @@ def image_meta(url, url_idx, web):

 def post_process(item, board, helper, web):
-    item["_v"] = 1.4
+    item["_v"] = 1.5
     item["_id"] = helper.item_unique_id(item, board)
     item["_board"] = board
@@ -100,5 +102,13 @@ def get_links_from_body(body):
     return result


+def get_links_from_html_body(body, base_url):
+    result = []
+    for match in HTML_HREF_RE.finditer(body):
+        url = match.group(1)
+        result.append(urljoin(base_url, url))
+    return result
+
+
 def is_external(url):
     return not url.startswith(("#", "/"))

2
run.py
View File

@@ -10,7 +10,7 @@ from threading import Thread
 import pika

 import monitoring
-from chan import CHANS
+from chan.chan import CHANS
 from post_process import post_process
 from util import logger, Web