mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-10 14:06:42 +00:00
rename meta attributes, add 2ch.hk support, version bump
This commit is contained in:
parent
2890222c4d
commit
9447463e56
4
.gitignore
vendored
4
.gitignore
vendored
@ -2,5 +2,5 @@
|
|||||||
__pychache__/
|
__pychache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
*.iml
|
*.iml
|
||||||
*.db
|
*.log
|
||||||
*.log
|
state.db*
|
||||||
|
@ -5,7 +5,7 @@ image boards and publishes serialised JSON to RabbitMQ
|
|||||||
for real-time ingest.
|
for real-time ingest.
|
||||||
|
|
||||||
Compatible image boards: 4chan, lainchan, uboachan,
|
Compatible image boards: 4chan, lainchan, uboachan,
|
||||||
22chan, wizchan, 1chan.
|
22chan, wizchan, 1chan, 2ch.hk.
|
||||||
|
|
||||||
Can optionally push monitoring data to InfluxDB. Below is an
|
Can optionally push monitoring data to InfluxDB. Below is an
|
||||||
example of Grafana being used to display it.
|
example of Grafana being used to display it.
|
||||||
|
105
chan.py
105
chan.py
@ -1,3 +1,8 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
from post_process import get_links_from_body
|
||||||
|
|
||||||
|
|
||||||
class ChanHelper:
|
class ChanHelper:
|
||||||
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
|
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
|
||||||
self.db_id = db_id
|
self.db_id = db_id
|
||||||
@ -16,6 +21,85 @@ class ChanHelper:
|
|||||||
def posts_url(self, board, thread):
|
def posts_url(self, board, thread):
|
||||||
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
|
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_id(item):
|
||||||
|
return item["no"]
|
||||||
|
|
||||||
|
def item_urls(self, item, board):
|
||||||
|
urls = set()
|
||||||
|
|
||||||
|
if "com" in item and item["com"]:
|
||||||
|
urls.update(get_links_from_body(item["com"]))
|
||||||
|
elif "sub" in item and item["sub"]:
|
||||||
|
urls.update(get_links_from_body(item["sub"]))
|
||||||
|
if "fsize" in item and item["fsize"]:
|
||||||
|
urls.add(self.image_url(board, item["tim"], item["ext"]))
|
||||||
|
|
||||||
|
return list(urls)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_type(item):
|
||||||
|
return "thread" if "sub" in item else "post"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def thread_mtime(thread):
|
||||||
|
return thread["last_modified"]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_threads_list(content):
|
||||||
|
j = json.loads(content)
|
||||||
|
for page in j:
|
||||||
|
for thread in page["threads"]:
|
||||||
|
yield thread
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_thread(content):
|
||||||
|
j = json.loads(content)
|
||||||
|
return j["posts"]
|
||||||
|
|
||||||
|
|
||||||
|
class RussianChanHelper(ChanHelper):
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_id(item):
|
||||||
|
return int(item["num"])
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_threads_list(content):
|
||||||
|
j = json.loads(content)
|
||||||
|
return j["threads"]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_thread(content):
|
||||||
|
j = json.loads(content)
|
||||||
|
for thread in j["threads"]:
|
||||||
|
for post in thread["posts"]:
|
||||||
|
yield post
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def thread_mtime(thread):
|
||||||
|
return thread["posts_count"]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_type(item):
|
||||||
|
return "thread" if "subject" in item and item["subject"] != "" else "post"
|
||||||
|
|
||||||
|
def item_urls(self, item, board):
|
||||||
|
urls = set()
|
||||||
|
|
||||||
|
if "comment" in item and item["comment"]:
|
||||||
|
urls.update(get_links_from_body(item["comment"]))
|
||||||
|
elif "subject" in item and item["subject"]:
|
||||||
|
urls.update(get_links_from_body(item["subject"]))
|
||||||
|
|
||||||
|
if urls:
|
||||||
|
print(list(urls))
|
||||||
|
|
||||||
|
for file in item["files"]:
|
||||||
|
urls.add(self._base_url + file["path"])
|
||||||
|
|
||||||
|
return list(urls)
|
||||||
|
|
||||||
|
|
||||||
CHANS = {
|
CHANS = {
|
||||||
"4chan": ChanHelper(
|
"4chan": ChanHelper(
|
||||||
@ -87,6 +171,25 @@ CHANS = {
|
|||||||
"/src/",
|
"/src/",
|
||||||
[
|
[
|
||||||
"rails"
|
"rails"
|
||||||
]
|
],
|
||||||
|
),
|
||||||
|
"2chhk": RussianChanHelper(
|
||||||
|
7,
|
||||||
|
"https://2ch.hk/",
|
||||||
|
"https://2ch.hk/",
|
||||||
|
"/res/",
|
||||||
|
"/src/",
|
||||||
|
[
|
||||||
|
"d", "b", "o", "soc", "media", "r", "api", "rf", "int",
|
||||||
|
"po", "news", "hry", "au", "bi", "biz", "bo", "c", "em",
|
||||||
|
"fa", "fiz", "fl", "ftb", "hh", "hi", "me", "mg", "mlp",
|
||||||
|
"mo", "mov", "mu", "ne", "psy", "re",
|
||||||
|
"sci", "sf", "sn", "sp", "spc", "tv", "un", "w", "wh",
|
||||||
|
"wm", "wp", "zog", "de", "di", "diy", "mus", "pa", "p",
|
||||||
|
"wrk", "trv", "gd", "hw", "mobi", "pr", "ra", "s", "t",
|
||||||
|
"web", "bg", "cg", "gsg", "ruvn", "tes", "v", "vg", "wr",
|
||||||
|
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
|
||||||
|
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
|
||||||
|
],
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -3,31 +3,20 @@ import re
|
|||||||
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||||
|
|
||||||
|
|
||||||
def post_process(thing, board, helper):
|
def post_process(item, board, helper):
|
||||||
thing["v"] = 1.1
|
item["_v"] = 1.2
|
||||||
thing["_id"] = int(thing["no"])
|
item["_id"] = helper.item_id(item)
|
||||||
|
|
||||||
thing["board"] = board
|
item["_board"] = board
|
||||||
thing["chan"] = helper.db_id
|
item["_chan"] = helper.db_id
|
||||||
|
|
||||||
if "com" in thing and thing["com"]:
|
item["_urls"] = helper.item_urls(item, board)
|
||||||
thing["urls"] = get_links_from_body(thing["com"])
|
|
||||||
elif "sub" in thing and thing["sub"]:
|
|
||||||
thing["urls"] = get_links_from_body(thing["sub"])
|
|
||||||
if "fsize" in thing and thing["fsize"]:
|
|
||||||
url = helper.image_url(board, thing["tim"], thing["ext"])
|
|
||||||
if "urls" in thing:
|
|
||||||
thing["urls"].append(url)
|
|
||||||
else:
|
|
||||||
thing["urls"] = [url]
|
|
||||||
if "urls" not in thing:
|
|
||||||
thing["urls"] = []
|
|
||||||
|
|
||||||
return thing
|
return item
|
||||||
|
|
||||||
|
|
||||||
def get_links_from_body(body):
|
def get_links_from_body(body):
|
||||||
result = set()
|
result = []
|
||||||
|
|
||||||
body = body \
|
body = body \
|
||||||
.replace("<wbr>", "") \
|
.replace("<wbr>", "") \
|
||||||
@ -37,9 +26,9 @@ def get_links_from_body(body):
|
|||||||
for match in LINK_RE.finditer(body):
|
for match in LINK_RE.finditer(body):
|
||||||
url = match.group(1)
|
url = match.group(1)
|
||||||
if is_external(url):
|
if is_external(url):
|
||||||
result.add(url)
|
result.append(url)
|
||||||
|
|
||||||
return list(result)
|
return result
|
||||||
|
|
||||||
|
|
||||||
def is_external(url):
|
def is_external(url):
|
||||||
|
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
requests
|
||||||
|
requests[socks]
|
||||||
|
stem
|
||||||
|
influxdb
|
||||||
|
pika
|
35
run.py
35
run.py
@ -23,27 +23,22 @@ class ChanScanner:
|
|||||||
self.helper = helper
|
self.helper = helper
|
||||||
self.state = ChanState()
|
self.state = ChanState()
|
||||||
|
|
||||||
def _fetch_threads(self, board):
|
def _threads(self, board):
|
||||||
r = self.web.get(self.helper.threads_url(board))
|
r = self.web.get(self.helper.threads_url(board))
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
return r.json()
|
return self.helper.parse_threads_list(r.text)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def _fetch_posts(self, board, thread):
|
def _fetch_posts(self, board, thread):
|
||||||
r = self.web.get(self.helper.posts_url(board, thread))
|
r = self.web.get(self.helper.posts_url(board, thread))
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
return r.json()
|
return self.helper.parse_thread(r.text)
|
||||||
return {"posts": []}
|
return []
|
||||||
|
|
||||||
def _threads(self, board):
|
|
||||||
for page in self._fetch_threads(board):
|
|
||||||
for thread in page["threads"]:
|
|
||||||
yield thread
|
|
||||||
|
|
||||||
def _posts(self, board):
|
def _posts(self, board):
|
||||||
for thread in sorted(self._threads(board), key=lambda x: x["no"]):
|
for thread in self._threads(board):
|
||||||
if self.state.has_new_posts(thread, self.helper):
|
if self.state.has_new_posts(thread, self.helper):
|
||||||
for post in sorted(self._fetch_posts(board, thread["no"])["posts"], key=lambda x: x["no"]):
|
for post in self._fetch_posts(board, self.helper.item_id(thread)):
|
||||||
yield post
|
yield post
|
||||||
self.state.mark_thread_as_visited(thread, self.helper)
|
self.state.mark_thread_as_visited(thread, self.helper)
|
||||||
|
|
||||||
@ -55,9 +50,9 @@ class ChanScanner:
|
|||||||
|
|
||||||
def once(func):
|
def once(func):
|
||||||
def wrapper(item, board, helper):
|
def wrapper(item, board, helper):
|
||||||
if not state.has_visited(item["no"], helper):
|
if not state.has_visited(helper.item_id(item), helper):
|
||||||
func(item, board, helper)
|
func(item, board, helper)
|
||||||
state.mark_visited(item["no"], helper)
|
state.mark_visited(helper.item_id(item), helper)
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
@ -110,10 +105,10 @@ class ChanState:
|
|||||||
cur = conn.cursor()
|
cur = conn.cursor()
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"SELECT last_modified FROM threads WHERE thread=? AND chan=?",
|
"SELECT last_modified FROM threads WHERE thread=? AND chan=?",
|
||||||
(thread["no"], helper.db_id)
|
(helper.item_id(thread), helper.db_id)
|
||||||
)
|
)
|
||||||
row = cur.fetchone()
|
row = cur.fetchone()
|
||||||
if not row or thread["last_modified"] != row[0]:
|
if not row or helper.thread_mtime(thread) != row[0]:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -124,8 +119,8 @@ class ChanState:
|
|||||||
"VALUES (?,?,?) "
|
"VALUES (?,?,?) "
|
||||||
"ON CONFLICT (thread, chan) "
|
"ON CONFLICT (thread, chan) "
|
||||||
"DO UPDATE SET last_modified=?",
|
"DO UPDATE SET last_modified=?",
|
||||||
(thread["no"], thread["last_modified"], helper.db_id,
|
(helper.item_id(thread), helper.thread_mtime(thread), helper.db_id,
|
||||||
thread["last_modified"])
|
helper.thread_mtime(thread))
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
@ -144,7 +139,7 @@ def publish_worker(queue: Queue, helper):
|
|||||||
|
|
||||||
@once
|
@once
|
||||||
def publish(item, board, helper):
|
def publish(item, board, helper):
|
||||||
item_type = "thread" if "sub" in item else "post"
|
item_type = helper.item_type(item)
|
||||||
post_process(item, board, helper)
|
post_process(item, board, helper)
|
||||||
|
|
||||||
chan_channel.basic_publish(
|
chan_channel.basic_publish(
|
||||||
@ -154,9 +149,9 @@ def publish(item, board, helper):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if MONITORING:
|
if MONITORING:
|
||||||
distance = datetime.utcnow() - datetime.fromtimestamp(item["time"])
|
distance = datetime.utcnow() - datetime.fromtimestamp(helper.item_mtime(item))
|
||||||
monitoring.log([{
|
monitoring.log([{
|
||||||
"measurement": helper.db_id,
|
"measurement": chan,
|
||||||
"time": str(datetime.utcnow()),
|
"time": str(datetime.utcnow()),
|
||||||
"tags": {
|
"tags": {
|
||||||
"board": board
|
"board": board
|
||||||
|
Loading…
x
Reference in New Issue
Block a user