mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-10 14:06:42 +00:00
Initial commit
This commit is contained in:
commit
210d032703
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
.idea/
|
||||
__pychache__/
|
||||
*.pyc
|
||||
*.iml
|
||||
*.db
|
||||
*.log
|
13
README.md
Normal file
13
README.md
Normal file
@ -0,0 +1,13 @@
|
||||
### chan_feed
|
||||
|
||||
Daemon that fetches posts from compatible *chan
|
||||
image boards and publishes serialised JSON to RabbitMQ
|
||||
for real-time ingest.
|
||||
|
||||
Compatible image boards: 4chan, lainchan, uboachan,
|
||||
22chan, wizchan, 1chan.
|
||||
|
||||
Can optionally push monitoring data to InfluxDB. Below is an
|
||||
example of Grafana being used to display it.
|
||||
|
||||

|
92
chan.py
Normal file
92
chan.py
Normal file
@ -0,0 +1,92 @@
|
||||
class ChanHelper:
|
||||
def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
|
||||
self.db_id = db_id
|
||||
self._base_url = base_url
|
||||
self._image_url = image_url
|
||||
self._thread_path = thread_path
|
||||
self._image_path = image_path
|
||||
self.boards = boards
|
||||
|
||||
def image_url(self, board, tim, extension):
|
||||
return "%s%s%s%s%s" % (self._image_url, board, self._image_path, tim, extension)
|
||||
|
||||
def threads_url(self, board):
|
||||
return "%s%s/threads.json" % (self._base_url, board)
|
||||
|
||||
def posts_url(self, board, thread):
|
||||
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
|
||||
|
||||
|
||||
CHANS = {
|
||||
"4chan": ChanHelper(
|
||||
1,
|
||||
"https://a.4cdn.org/",
|
||||
"https://i.4cdn.org/",
|
||||
"/thread/",
|
||||
"/",
|
||||
[
|
||||
"a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
|
||||
"k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
|
||||
"vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
|
||||
"cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
|
||||
"bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
|
||||
"gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
|
||||
"news", "out", "po", "pol", "qst", "sci", "soc", "sp",
|
||||
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
|
||||
]
|
||||
),
|
||||
"lainchan": ChanHelper(
|
||||
2,
|
||||
"https://lainchan.org/",
|
||||
"https://lainchan.org/",
|
||||
"/res/",
|
||||
"/src/",
|
||||
[
|
||||
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
|
||||
"hum", "drg", "zzz", "layer" "q", "r", "cult", "psy",
|
||||
"mega", "random"
|
||||
]
|
||||
),
|
||||
"uboachan": ChanHelper(
|
||||
3,
|
||||
"https://uboachan.net/",
|
||||
"https://uboachan.net/",
|
||||
"/res/",
|
||||
"/src/",
|
||||
[
|
||||
"yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
|
||||
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
|
||||
]
|
||||
),
|
||||
"22chan": ChanHelper(
|
||||
4,
|
||||
"https://22chan.org/",
|
||||
"https://22chan.org/",
|
||||
"/res/",
|
||||
"/src/",
|
||||
[
|
||||
"a", "b", "f", "feels", "i", "k", "mu", "pol", "sewers",
|
||||
"sg", "t", "vg"
|
||||
]
|
||||
),
|
||||
"wizchan": ChanHelper(
|
||||
5,
|
||||
"https://wizchan.org/",
|
||||
"https://wizchan.org/",
|
||||
"/res/",
|
||||
"/src/",
|
||||
[
|
||||
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
|
||||
]
|
||||
),
|
||||
"1chan": ChanHelper(
|
||||
6,
|
||||
"https://www.1chan.net/",
|
||||
"https://www.1chan.net/",
|
||||
"/res/",
|
||||
"/src/",
|
||||
[
|
||||
"rails"
|
||||
]
|
||||
)
|
||||
}
|
BIN
monitoring.png
Normal file
BIN
monitoring.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 87 KiB |
18
monitoring.py
Normal file
18
monitoring.py
Normal file
@ -0,0 +1,18 @@
|
||||
from influxdb import InfluxDBClient
|
||||
|
||||
client = InfluxDBClient("localhost", 8086, "root", "root", "chan_feed")
|
||||
|
||||
|
||||
def init():
|
||||
db_exists = False
|
||||
for db in client.get_list_database():
|
||||
if db["name"] == "chan_feed":
|
||||
db_exists = True
|
||||
break
|
||||
|
||||
if not db_exists:
|
||||
client.create_database("chan_feed")
|
||||
|
||||
|
||||
def log(event):
|
||||
client.write_points(event)
|
45
post_process.py
Normal file
45
post_process.py
Normal file
@ -0,0 +1,45 @@
|
||||
import re
|
||||
|
||||
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
||||
|
||||
|
||||
def post_process(thing, board, helper):
|
||||
thing["v"] = 1.0
|
||||
|
||||
thing["board"] = board
|
||||
thing["chan"] = helper.db_id
|
||||
|
||||
if "com" in thing and thing["com"]:
|
||||
thing["urls"] = get_links_from_body(thing["com"])
|
||||
elif "sub" in thing and thing["sub"]:
|
||||
thing["urls"] = get_links_from_body(thing["sub"])
|
||||
if "fsize" in thing and thing["fsize"]:
|
||||
url = helper.image_url(board, thing["tim"], thing["ext"])
|
||||
if "urls" in thing:
|
||||
thing["urls"].append(url)
|
||||
else:
|
||||
thing["urls"] = [url]
|
||||
if "urls" not in thing:
|
||||
thing["urls"] = []
|
||||
|
||||
return thing
|
||||
|
||||
|
||||
def get_links_from_body(body):
|
||||
result = set()
|
||||
|
||||
body = body \
|
||||
.replace("<wbr>", "") \
|
||||
.replace("</s>", "") \
|
||||
.replace(" dot ", ".")
|
||||
|
||||
for match in LINK_RE.finditer(body):
|
||||
url = match.group(1)
|
||||
if is_external(url):
|
||||
result.add(url)
|
||||
|
||||
return list(result)
|
||||
|
||||
|
||||
def is_external(url):
|
||||
return not url.startswith(("#", "/"))
|
195
run.py
Normal file
195
run.py
Normal file
@ -0,0 +1,195 @@
|
||||
import datetime
|
||||
import json
|
||||
import sqlite3
|
||||
import sys
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from queue import Queue
|
||||
from threading import Thread
|
||||
|
||||
import pika
|
||||
|
||||
import monitoring
|
||||
from chan import CHANS
|
||||
from post_process import post_process
|
||||
from util import logger, Web
|
||||
|
||||
MONITORING = True
|
||||
|
||||
|
||||
class ChanScanner:
|
||||
def __init__(self, helper):
|
||||
self.web = Web(monitoring if MONITORING else None)
|
||||
self.helper = helper
|
||||
self.state = ChanState()
|
||||
|
||||
def _fetch_threads(self, board):
|
||||
r = self.web.get(self.helper.threads_url(board))
|
||||
if r.status_code == 200:
|
||||
return r.json()
|
||||
return []
|
||||
|
||||
def _fetch_posts(self, board, thread):
|
||||
r = self.web.get(self.helper.posts_url(board, thread))
|
||||
if r.status_code == 200:
|
||||
return r.json()
|
||||
return {"posts": []}
|
||||
|
||||
def _threads(self, board):
|
||||
for page in self._fetch_threads(board):
|
||||
for thread in page["threads"]:
|
||||
yield thread
|
||||
|
||||
def _posts(self, board):
|
||||
for thread in sorted(self._threads(board), key=lambda x: x["no"]):
|
||||
if self.state.has_new_posts(thread, self.helper):
|
||||
for post in sorted(self._fetch_posts(board, thread["no"])["posts"], key=lambda x: x["no"]):
|
||||
yield post
|
||||
self.state.mark_thread_as_visited(thread, self.helper)
|
||||
|
||||
def all_posts(self):
|
||||
for board in self.helper.boards:
|
||||
for post in self._posts(board):
|
||||
yield post, board
|
||||
|
||||
|
||||
def once(func):
|
||||
def wrapper(item, board, helper):
|
||||
if not state.has_visited(item["no"], helper):
|
||||
func(item, board, helper)
|
||||
state.mark_visited(item["no"], helper)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
class ChanState:
|
||||
def __init__(self):
|
||||
self._db = "state.db"
|
||||
|
||||
with sqlite3.connect(self._db) as conn:
|
||||
conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS posts "
|
||||
"("
|
||||
" post INT,"
|
||||
" ts INT DEFAULT (strftime('%s','now')),"
|
||||
" chan INT,"
|
||||
" PRIMARY KEY(post, chan)"
|
||||
")"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS threads "
|
||||
"("
|
||||
" thread INT,"
|
||||
" last_modified INT,"
|
||||
" ts INT DEFAULT (strftime('%s','now')),"
|
||||
" chan INT,"
|
||||
" PRIMARY KEY(thread, chan)"
|
||||
")"
|
||||
)
|
||||
conn.execute("PRAGMA journal_mode=wal")
|
||||
conn.commit()
|
||||
|
||||
def mark_visited(self, item: int, helper):
|
||||
with sqlite3.connect(self._db) as conn:
|
||||
conn.execute(
|
||||
"INSERT INTO posts (post, chan) VALUES (?,?)",
|
||||
(item, helper.db_id)
|
||||
)
|
||||
|
||||
def has_visited(self, item: int, helper):
|
||||
with sqlite3.connect(self._db) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"SELECT post FROM posts WHERE post=? AND chan=?",
|
||||
(item, helper.db_id)
|
||||
)
|
||||
return cur.fetchone() is not None
|
||||
|
||||
def has_new_posts(self, thread, helper):
|
||||
with sqlite3.connect(self._db) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"SELECT last_modified FROM threads WHERE thread=? AND chan=?",
|
||||
(thread["no"], helper.db_id)
|
||||
)
|
||||
row = cur.fetchone()
|
||||
if not row or thread["last_modified"] != row[0]:
|
||||
return True
|
||||
return False
|
||||
|
||||
def mark_thread_as_visited(self, thread, helper):
|
||||
with sqlite3.connect(self._db) as conn:
|
||||
conn.execute(
|
||||
"INSERT INTO threads (thread, last_modified, chan) "
|
||||
"VALUES (?,?,?) "
|
||||
"ON CONFLICT (thread, chan) "
|
||||
"DO UPDATE SET last_modified=?",
|
||||
(thread["no"], thread["last_modified"], helper.db_id,
|
||||
thread["last_modified"])
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def publish_worker(queue: Queue, helper):
|
||||
while True:
|
||||
try:
|
||||
item, board = queue.get()
|
||||
publish(item, board, helper)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(str(e) + ": " + traceback.format_exc())
|
||||
finally:
|
||||
queue.task_done()
|
||||
|
||||
|
||||
@once
|
||||
def publish(item, board, helper):
|
||||
item_type = "thread" if "sub" in item else "post"
|
||||
post_process(item, board, helper)
|
||||
|
||||
chan_channel.basic_publish(
|
||||
exchange='chan',
|
||||
routing_key="%d.%s.%s" % (helper.db_id, item_type, board),
|
||||
body=json.dumps(item)
|
||||
)
|
||||
|
||||
if MONITORING:
|
||||
distance = datetime.utcnow() - datetime.fromtimestamp(item["time"])
|
||||
monitoring.log([{
|
||||
"measurement": helper.db_id,
|
||||
"time": str(datetime.utcnow()),
|
||||
"tags": {
|
||||
"board": board
|
||||
},
|
||||
"fields": {
|
||||
"distance": distance.total_seconds()
|
||||
}
|
||||
}])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
logger.error("You must specify RabbitMQ host & chan!")
|
||||
quit(1)
|
||||
|
||||
rabbitmq_host = sys.argv[1]
|
||||
chan = sys.argv[2]
|
||||
chan_helper = CHANS[chan]
|
||||
|
||||
if MONITORING:
|
||||
monitoring.init()
|
||||
state = ChanState()
|
||||
|
||||
publish_q = Queue()
|
||||
publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
|
||||
publish_thread.start()
|
||||
|
||||
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
|
||||
chan_channel = rabbit.channel()
|
||||
chan_channel.exchange_declare(exchange="chan", exchange_type="topic")
|
||||
|
||||
s = ChanScanner(chan_helper)
|
||||
while True:
|
||||
for p, b in s.all_posts():
|
||||
publish_q.put((p, b))
|
60
util.py
Normal file
60
util.py
Normal file
@ -0,0 +1,60 @@
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from logging import FileHandler, StreamHandler
|
||||
|
||||
import requests
|
||||
|
||||
last_time_called = dict()
|
||||
|
||||
logger = logging.getLogger("default")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
|
||||
file_handler = FileHandler("chan_feed.log")
|
||||
file_handler.setFormatter(formatter)
|
||||
for h in logger.handlers:
|
||||
logger.removeHandler(h)
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(StreamHandler(sys.stdout))
|
||||
|
||||
|
||||
def rate_limit(per_second):
|
||||
min_interval = 1.0 / float(per_second)
|
||||
|
||||
def decorate(func):
|
||||
last_time_called[func] = 0
|
||||
|
||||
def wrapper(*args, **kwargs):
|
||||
elapsed = time.perf_counter() - last_time_called[func]
|
||||
wait_time = min_interval - elapsed
|
||||
if wait_time > 0:
|
||||
time.sleep(wait_time)
|
||||
|
||||
last_time_called[func] = time.perf_counter()
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorate
|
||||
|
||||
|
||||
class Web:
|
||||
def __init__(self, monitoring):
|
||||
self.session = requests.Session()
|
||||
self.monitoring = monitoring
|
||||
|
||||
@rate_limit(1 / 2) # TODO: per chan rate limit?
|
||||
def get(self, url, **kwargs):
|
||||
r = self.session.get(url, **kwargs)
|
||||
logger.debug("GET %s <%d>" % (url, r.status_code))
|
||||
if self.monitoring:
|
||||
self.monitoring.log([{
|
||||
"measurement": "web",
|
||||
"time": str(datetime.utcnow()),
|
||||
"fields": {
|
||||
"status_code": r.status_code
|
||||
}
|
||||
}])
|
||||
return r
|
Loading…
x
Reference in New Issue
Block a user