mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-20 10:36:43 +00:00
Initial commit
This commit is contained in:
commit
210d032703
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
.idea/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.iml
|
||||||
|
*.db
|
||||||
|
*.log
|
13
README.md
Normal file
13
README.md
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
### chan_feed
|
||||||
|
|
||||||
|
Daemon that fetches posts from compatible *chan
|
||||||
|
image boards and publishes serialised JSON to RabbitMQ
|
||||||
|
for real-time ingest.
|
||||||
|
|
||||||
|
Compatible image boards: 4chan, lainchan, uboachan,
|
||||||
|
22chan, wizchan, 1chan.
|
||||||
|
|
||||||
|
Can optionally push monitoring data to InfluxDB. Below is an
|
||||||
|
example of Grafana being used to display it.
|
||||||
|
|
||||||
|

|
92
chan.py
Normal file
92
chan.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
class ChanHelper:
    """URL builder and board list for a single image board.

    ``db_id`` and ``boards`` are public attributes. The URL prefixes and
    path fragments are private and are concatenated verbatim by the methods
    below, so each fragment must already carry its own slashes
    (e.g. ``"https://a.4cdn.org/"`` and ``"/thread/"``).
    """

    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
        # Public attributes.
        self.db_id = db_id
        self.boards = boards
        # Private URL building blocks.
        self._base_url = base_url
        self._image_url = image_url
        self._thread_path = thread_path
        self._image_path = image_path

    def image_url(self, board, tim, extension):
        """Return the URL of the file named ``tim`` + ``extension`` on ``board``."""
        return f"{self._image_url}{board}{self._image_path}{tim}{extension}"

    def threads_url(self, board):
        """Return the URL of the ``threads.json`` index of ``board``."""
        return f"{self._base_url}{board}/threads.json"

    def posts_url(self, board, thread):
        """Return the URL of the JSON document of ``thread`` on ``board``."""
        # "%d" is kept so the thread id is formatted as an integer.
        thread_file = "%d.json" % thread
        return f"{self._base_url}{board}{self._thread_path}{thread_file}"
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of supported image boards, keyed by the name given on the
# command line. Every URL fragment is concatenated verbatim by ChanHelper,
# so the slashes below are significant.
CHANS = {
    "4chan": ChanHelper(
        db_id=1,
        base_url="https://a.4cdn.org/",
        image_url="https://i.4cdn.org/",
        thread_path="/thread/",
        image_path="/",
        boards=[
            "a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
            "k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
            "vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
            "cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
            "bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
            "gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
            "news", "out", "po", "pol", "qst", "sci", "soc", "sp",
            "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x",
        ],
    ),
    "lainchan": ChanHelper(
        db_id=2,
        base_url="https://lainchan.org/",
        image_url="https://lainchan.org/",
        thread_path="/res/",
        image_path="/src/",
        boards=[
            "λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
            # "layer" and "q" are two boards; a missing comma used to fuse
            # them into the non-existent board "layerq".
            "hum", "drg", "zzz", "layer", "q", "r", "cult", "psy",
            "mega", "random",
        ],
    ),
    "uboachan": ChanHelper(
        db_id=3,
        base_url="https://uboachan.net/",
        image_url="https://uboachan.net/",
        thread_path="/res/",
        image_path="/src/",
        boards=[
            "yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
            "ig", "2", "ot", "hikki", "cc", "x", "sugg",
        ],
    ),
    "22chan": ChanHelper(
        db_id=4,
        base_url="https://22chan.org/",
        image_url="https://22chan.org/",
        thread_path="/res/",
        image_path="/src/",
        boards=[
            "a", "b", "f", "feels", "i", "k", "mu", "pol", "sewers",
            "sg", "t", "vg",
        ],
    ),
    "wizchan": ChanHelper(
        db_id=5,
        base_url="https://wizchan.org/",
        image_url="https://wizchan.org/",
        thread_path="/res/",
        image_path="/src/",
        boards=[
            "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
        ],
    ),
    "1chan": ChanHelper(
        db_id=6,
        base_url="https://www.1chan.net/",
        image_url="https://www.1chan.net/",
        thread_path="/res/",
        image_path="/src/",
        boards=[
            "rails",
        ],
    ),
}
|
BIN
monitoring.png
Normal file
BIN
monitoring.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 87 KiB |
18
monitoring.py
Normal file
18
monitoring.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
from influxdb import InfluxDBClient
|
||||||
|
|
||||||
|
# Module-wide InfluxDB connection shared by init() and log().
client = InfluxDBClient(
    host="localhost",
    port=8086,
    username="root",
    password="root",
    database="chan_feed",
)
|
||||||
|
|
||||||
|
|
||||||
|
def init():
    """Create the ``chan_feed`` InfluxDB database unless it already exists."""
    already_there = any(
        db["name"] == "chan_feed" for db in client.get_list_database()
    )
    if not already_there:
        client.create_database("chan_feed")
|
||||||
|
|
||||||
|
|
||||||
|
def log(event):
    """Forward ``event`` unchanged to ``client.write_points``.

    Callers in this project pass a list containing one point dict with
    ``measurement``, ``time`` and ``fields`` keys (plus optional ``tags``).
    """
    client.write_points(event)
|
45
post_process.py
Normal file
45
post_process.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
# Matches http:// and https:// URLs inside a post body. Group 1 is the whole
# URL: a host made of word characters, '-', '_' and '.', ending in a dot plus
# 2-4 lowercase letters, followed by everything up to the next whitespace,
# '<' or quote character (or the end of the string).
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||||
|
|
||||||
|
|
||||||
|
def post_process(thing, board, helper):
    """Annotate a raw post/thread dict in place and return it.

    Adds ``v`` (schema version), ``board``, ``chan`` (``helper.db_id``) and
    ``urls``. ``urls`` holds the links found in the comment (``com``) or,
    when there is no comment, in the subject (``sub``), followed by the
    attachment URL when the item has a non-zero ``fsize``. ``urls`` is always
    present afterwards, possibly empty.
    """
    thing["v"] = 1.0
    thing["board"] = board
    thing["chan"] = helper.db_id

    # The comment wins over the subject; the subject is only scanned when the
    # comment is missing or empty.
    text = thing.get("com") or thing.get("sub")
    if text:
        thing["urls"] = get_links_from_body(text)

    if thing.get("fsize"):
        attachment = helper.image_url(board, thing["tim"], thing["ext"])
        thing.setdefault("urls", []).append(attachment)

    thing.setdefault("urls", [])
    return thing
|
||||||
|
|
||||||
|
|
||||||
|
def get_links_from_body(body):
    """Return the distinct external URLs found in ``body`` as a list.

    Before matching, ``<wbr>`` and ``</s>`` markup is stripped and the
    obfuscation `` dot `` is turned back into ``.``. The result comes from a
    set, so its order is unspecified.
    """
    cleaned = body.replace("<wbr>", "")
    cleaned = cleaned.replace("</s>", "")
    cleaned = cleaned.replace(" dot ", ".")

    links = {
        match.group(1)
        for match in LINK_RE.finditer(cleaned)
        if is_external(match.group(1))
    }
    return list(links)
|
||||||
|
|
||||||
|
|
||||||
|
def is_external(url):
    """Return True unless ``url`` is a fragment (``#...``) or a site-relative path (``/...``)."""
    is_internal = url.startswith("#") or url.startswith("/")
    return not is_internal
|
195
run.py
Normal file
195
run.py
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
from datetime import datetime
|
||||||
|
from queue import Queue
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
import pika
|
||||||
|
|
||||||
|
import monitoring
|
||||||
|
from chan import CHANS
|
||||||
|
from post_process import post_process
|
||||||
|
from util import logger, Web
|
||||||
|
|
||||||
|
# Feature switch: when true, the monitoring module is initialised at startup,
# handed to Web for per-request metrics, and publish() records a point for
# every published item.
MONITORING = True
|
||||||
|
|
||||||
|
|
||||||
|
class ChanScanner:
    """Walks every board of one image board and yields its posts.

    Threads and posts are visited in ascending ``no`` order. A thread is only
    fetched when ``ChanState.has_new_posts`` reports a change, and it is
    marked as visited once all of its posts have been yielded.
    """

    def __init__(self, helper):
        self.helper = helper
        self.state = ChanState()
        self.web = Web(monitoring if MONITORING else None)

    def _fetch_threads(self, board):
        """Return the decoded thread index of ``board``, or ``[]`` on a non-200 reply."""
        response = self.web.get(self.helper.threads_url(board))
        return response.json() if response.status_code == 200 else []

    def _fetch_posts(self, board, thread):
        """Return the decoded JSON of ``thread``, or an empty post list on a non-200 reply."""
        response = self.web.get(self.helper.posts_url(board, thread))
        if response.status_code != 200:
            return {"posts": []}
        return response.json()

    def _threads(self, board):
        """Yield every thread entry from every page of the board's thread index."""
        for page in self._fetch_threads(board):
            yield from page["threads"]

    def _posts(self, board):
        """Yield the posts of every thread of ``board`` that has changed."""
        ordered_threads = sorted(self._threads(board), key=lambda t: t["no"])
        for thread in ordered_threads:
            if not self.state.has_new_posts(thread, self.helper):
                continue
            posts = self._fetch_posts(board, thread["no"])["posts"]
            yield from sorted(posts, key=lambda p: p["no"])
            # Only reached once the consumer has pulled every post above.
            self.state.mark_thread_as_visited(thread, self.helper)

    def all_posts(self):
        """Yield ``(post, board)`` for every post of every configured board."""
        for board in self.helper.boards:
            yield from ((post, board) for post in self._posts(board))
|
||||||
|
|
||||||
|
|
||||||
|
def once(func):
    """Decorator: run ``func(item, board, helper)`` at most once per post.

    The post number ``item["no"]`` is looked up in the module-level ``state``.
    Items already recorded are skipped, and an item is recorded only after
    ``func`` returned without raising. The wrapper always returns ``None``.
    """

    def wrapper(item, board, helper):
        if state.has_visited(item["no"], helper):
            return
        func(item, board, helper)
        state.mark_visited(item["no"], helper)

    return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
class ChanState:
    """SQLite-backed record of which posts and threads were already processed.

    Two tables are kept, both keyed by ``(id, chan)`` so several image boards
    can share one database file:

    * ``posts(post, ts, chan)``: posts that have been published.
    * ``threads(thread, last_modified, ts, chan)``: the ``last_modified``
      value seen the last time a thread was fully scanned.

    Every operation opens its own short-lived connection and closes it again.
    """

    def __init__(self, db_path="state.db"):
        """Create the tables if needed and switch the database to WAL mode.

        :param db_path: path of the SQLite file; defaults to ``state.db`` in
            the current working directory.
        """
        self._db = db_path

        self._query(
            "CREATE TABLE IF NOT EXISTS posts "
            "("
            "   post INT,"
            "   ts INT DEFAULT (strftime('%s','now')),"
            "   chan INT,"
            "   PRIMARY KEY(post, chan)"
            ")"
        )
        self._query(
            "CREATE TABLE IF NOT EXISTS threads "
            "("
            "   thread INT,"
            "   last_modified INT,"
            "   ts INT DEFAULT (strftime('%s','now')),"
            "   chan INT,"
            "   PRIMARY KEY(thread, chan)"
            ")"
        )
        self._query("PRAGMA journal_mode=wal")

    def _query(self, sql, params=()):
        """Run one statement on a fresh connection and return its first row.

        Returns ``None`` when the statement produces no rows.
        """
        conn = sqlite3.connect(self._db)
        try:
            # ``with conn`` commits on success and rolls back on error...
            with conn:
                return conn.execute(sql, params).fetchone()
        finally:
            # ...but it does not close the connection, so do it explicitly.
            conn.close()

    def mark_visited(self, item: int, helper):
        """Record post number ``item`` of ``helper``'s image board as published.

        Raises ``sqlite3.IntegrityError`` if the post was already recorded.
        """
        self._query(
            "INSERT INTO posts (post, chan) VALUES (?,?)",
            (item, helper.db_id)
        )

    def has_visited(self, item: int, helper):
        """Return True if post number ``item`` was recorded by ``mark_visited``."""
        row = self._query(
            "SELECT post FROM posts WHERE post=? AND chan=?",
            (item, helper.db_id)
        )
        return row is not None

    def has_new_posts(self, thread, helper):
        """Return True if ``thread`` is unknown or its ``last_modified`` changed.

        ``thread`` is a dict with at least the keys ``no`` and
        ``last_modified``.
        """
        row = self._query(
            "SELECT last_modified FROM threads WHERE thread=? AND chan=?",
            (thread["no"], helper.db_id)
        )
        return row is None or thread["last_modified"] != row[0]

    def mark_thread_as_visited(self, thread, helper):
        """Insert or update the stored ``last_modified`` value of ``thread``."""
        self._query(
            "INSERT INTO threads (thread, last_modified, chan) "
            "VALUES (?,?,?) "
            "ON CONFLICT (thread, chan) "
            "DO UPDATE SET last_modified=?",
            (thread["no"], thread["last_modified"], helper.db_id,
             thread["last_modified"])
        )
|
||||||
|
|
||||||
|
|
||||||
|
def publish_worker(queue: Queue, helper):
    """Consume ``(item, board)`` tuples from ``queue`` forever and publish them.

    Any exception raised while handling an item is logged with its traceback
    and swallowed so the worker keeps running. ``task_done()`` is called for
    every item, whether or not publishing succeeded.
    """
    while True:
        try:
            post, board_name = queue.get()
            publish(post, board_name, helper)
        except Exception as err:
            logger.error("%s: %s" % (err, traceback.format_exc()))
        finally:
            queue.task_done()
|
||||||
|
|
||||||
|
|
||||||
|
@once
def publish(item, board, helper):
    """Post-process ``item`` and publish it as JSON to the ``chan`` exchange.

    The routing key is ``<chan id>.<thread|post>.<board>``; an item counts as
    a thread when it has a ``sub`` key. When ``MONITORING`` is enabled, the
    delay in seconds between the item's own ``time`` field and now is logged
    as ``distance``.
    """
    item_type = "thread" if "sub" in item else "post"
    post_process(item, board, helper)

    chan_channel.basic_publish(
        exchange='chan',
        routing_key="%d.%s.%s" % (helper.db_id, item_type, board),
        body=json.dumps(item)
    )

    if MONITORING:
        now = datetime.utcnow()
        # item["time"] is an epoch timestamp. It has to be converted with
        # utcfromtimestamp() so both operands are naive UTC; fromtimestamp()
        # yields local time and skews the result by the host's UTC offset.
        distance = now - datetime.utcfromtimestamp(item["time"])
        monitoring.log([{
            "measurement": helper.db_id,
            "time": str(now),
            "tags": {
                "board": board
            },
            "fields": {
                "distance": distance.total_seconds()
            }
        }])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Usage: run.py <rabbitmq host> <chan name>

    if len(sys.argv) < 3:
        logger.error("You must specify RabbitMQ host & chan!")
        # sys.exit() instead of quit(): quit is injected by the site module
        # and is not guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)

    rabbitmq_host = sys.argv[1]
    chan = sys.argv[2]
    if chan not in CHANS:
        # Fail with a readable message rather than a bare KeyError traceback.
        logger.error("Unknown chan '%s', expected one of: %s" % (chan, ", ".join(CHANS)))
        sys.exit(1)
    chan_helper = CHANS[chan]

    if MONITORING:
        monitoring.init()
    # Module-level name read by the @once decorator.
    state = ChanState()

    # The worker may start before the channel exists: it blocks on the empty
    # queue until the scan loop at the bottom starts feeding it.
    publish_q = Queue()
    publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
    publish_thread.start()

    rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
    # Module-level name read by publish().
    # NOTE(review): the channel is created here but used from the worker
    # thread; confirm this is safe with pika's BlockingConnection.
    chan_channel = rabbit.channel()
    chan_channel.exchange_declare(exchange="chan", exchange_type="topic")

    s = ChanScanner(chan_helper)
    # Rescan all boards forever; Web.get() is rate limited, which paces the loop.
    while True:
        for p, b in s.all_posts():
            publish_q.put((p, b))
|
60
util.py
Normal file
60
util.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from logging import FileHandler, StreamHandler
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# time.perf_counter() value of the most recent call of each rate-limited
# function, keyed by the undecorated function object (see rate_limit).
last_time_called = dict()

# Shared project logger: DEBUG and above go to chan_feed.log and to stdout.
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("chan_feed.log")
file_handler.setFormatter(formatter)
# Iterate over a copy: removeHandler() mutates logger.handlers, and removing
# while iterating the live list skips every other handler.
for h in list(logger.handlers):
    logger.removeHandler(h)
logger.addHandler(file_handler)
# NOTE(review): the stdout handler gets no formatter, so console lines lack
# the timestamp and level; confirm this is intentional.
logger.addHandler(StreamHandler(sys.stdout))
|
||||||
|
|
||||||
|
|
||||||
|
def rate_limit(per_second):
    """Decorator factory limiting a function to ``per_second`` calls per second.

    Calls arriving sooner than ``1 / per_second`` seconds after the previous
    one sleep for the remaining time. The last-call timestamp lives in the
    module-level ``last_time_called`` dict, keyed by the undecorated function,
    and is taken just before the wrapped function runs.
    """
    min_interval = 1.0 / float(per_second)

    def decorate(func):
        last_time_called[func] = 0

        def wrapper(*args, **kwargs):
            since_last = time.perf_counter() - last_time_called[func]
            remaining = min_interval - since_last
            if remaining > 0:
                time.sleep(remaining)

            last_time_called[func] = time.perf_counter()
            return func(*args, **kwargs)

        return wrapper

    return decorate
|
||||||
|
|
||||||
|
|
||||||
|
class Web:
    """Rate-limited HTTP client built on a single ``requests.Session``.

    ``monitoring`` is either a falsy value (metrics disabled) or an object
    with a ``log(points)`` function that receives one point per request.
    """

    def __init__(self, monitoring):
        self.monitoring = monitoring
        self.session = requests.Session()

    @rate_limit(1 / 2)  # TODO: per chan rate limit?
    def get(self, url, **kwargs):
        """GET ``url`` and return the response, logging the status code.

        Extra keyword arguments are passed through to ``Session.get``. The
        rate limit of 1/2 call per second means at most one request every
        two seconds.
        """
        response = self.session.get(url, **kwargs)
        status = response.status_code
        logger.debug("GET %s <%d>" % (url, status))

        if self.monitoring:
            point = {
                "measurement": "web",
                "time": str(datetime.utcnow()),
                "fields": {
                    "status_code": status
                }
            }
            self.monitoring.log([point])

        return response
|
Loading…
x
Reference in New Issue
Block a user