mirror of
				https://github.com/simon987/chan_feed.git
				synced 2025-10-28 05:46:53 +00:00 
			
		
		
		
	Initial commit
This commit is contained in:
		
						commit
						210d032703
					
				
							
								
								
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,6 @@ | ||||
| .idea/ | ||||
| __pycache__/ | ||||
| *.pyc | ||||
| *.iml | ||||
| *.db | ||||
| *.log | ||||
							
								
								
									
										13
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,13 @@ | ||||
| ### chan_feed | ||||
| 
 | ||||
| Daemon that fetches posts from compatible *chan | ||||
| image boards and publishes serialised JSON to RabbitMQ | ||||
|  for real-time ingest. | ||||
|   | ||||
| Compatible image boards: 4chan, lainchan, uboachan, | ||||
| 22chan, wizchan, 1chan. | ||||
| 
 | ||||
| Can optionally push monitoring data to InfluxDB. Below is an | ||||
| example of Grafana being used to display it. | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										92
									
								
								chan.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										92
									
								
								chan.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,92 @@ | ||||
class ChanHelper:
    """Describe one image board: its numeric id, URL layout and board list.

    ``db_id`` and ``boards`` are public attributes; the URL fragments are
    kept private and only used by the three URL-building methods.
    """

    def __init__(self, db_id, base_url, image_url, thread_path, image_path, boards):
        self.db_id = db_id
        self.boards = boards
        # Prefix for JSON endpoints and the path segment preceding a thread id
        self._base_url = base_url
        self._thread_path = thread_path
        # Prefix for attachments and the path segment preceding a file name
        self._image_url = image_url
        self._image_path = image_path

    def image_url(self, board, tim, extension):
        """Return the attachment URL: image prefix + board + image path + tim + extension."""
        pieces = (self._image_url, board, self._image_path, tim, extension)
        return "".join(str(piece) for piece in pieces)

    def threads_url(self, board):
        """Return the URL of the ``threads.json`` listing for *board*."""
        return str(self._base_url) + str(board) + "/threads.json"

    def posts_url(self, board, thread):
        """Return the URL of the JSON document for *thread* (a number) on *board*."""
        # %d keeps the original's integer formatting of the thread number
        thread_number = "%d" % thread
        prefix = str(self._base_url) + str(board) + str(self._thread_path)
        return prefix + thread_number + ".json"
| 
 | ||||
| 
 | ||||
# Registry of supported image boards, keyed by the name given on the command
# line. Each entry is
#   ChanHelper(db_id, base_url, image_url, thread_path, image_path, boards).
CHANS = {
    "4chan": ChanHelper(
        1,
        "https://a.4cdn.org/",
        "https://i.4cdn.org/",
        "/thread/",
        "/",
        [
            "a", "b", "c", "d", "e", "f", "g", "gif", "h", "hr",
            "k", "m", "o", "p", "r", "s", "t", "u", "v", "vg",
            "vr", "w", "wg", "i", "ic", "r9k", "s4s", "vip", "qa",
            "cm", "hm", "lgbt", "y", "3", "aco", "adv", "an", "asp",
            "bant", "biz", "cgl", "ck", "co", "diy", "fa", "fit",
            "gd", "hc", "his", "int", "jp", "lit", "mlp", "mu", "n",
            "news", "out", "po", "pol", "qst", "sci", "soc", "sp",
            "tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
        ]
    ),
    "lainchan": ChanHelper(
        2,
        "https://lainchan.org/",
        "https://lainchan.org/",
        "/res/",
        "/src/",
        [
            # "layer" and "q" are two boards; without the comma between them
            # Python concatenated the adjacent literals into "layerq".
            "λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
            "hum", "drg", "zzz", "layer", "q", "r", "cult", "psy",
            "mega", "random"
        ]
    ),
    "uboachan": ChanHelper(
        3,
        "https://uboachan.net/",
        "https://uboachan.net/",
        "/res/",
        "/src/",
        [
            "yn", "yndd", "fg", "yume", "o", "lit", "media", "og",
            "ig", "2", "ot", "hikki", "cc", "x", "sugg"
        ]
    ),
    "22chan": ChanHelper(
        4,
        "https://22chan.org/",
        "https://22chan.org/",
        "/res/",
        "/src/",
        [
            "a", "b", "f", "feels", "i", "k", "mu", "pol", "sewers",
            "sg", "t", "vg"
        ]
    ),
    "wizchan": ChanHelper(
        5,
        "https://wizchan.org/",
        "https://wizchan.org/",
        "/res/",
        "/src/",
        [
            "wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
        ]
    ),
    "1chan": ChanHelper(
        6,
        "https://www.1chan.net/",
        "https://www.1chan.net/",
        "/res/",
        "/src/",
        [
            "rails"
        ]
    )
}
							
								
								
									
										
											BIN
										
									
								
								monitoring.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								monitoring.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 87 KiB | 
							
								
								
									
										18
									
								
								monitoring.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								monitoring.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | ||||
| from influxdb import InfluxDBClient | ||||
| 
 | ||||
# Module-wide InfluxDB connection shared by init() and log().
client = InfluxDBClient(
    host="localhost",
    port=8086,
    username="root",
    password="root",
    database="chan_feed",
)
| 
 | ||||
| 
 | ||||
def init():
    """Create the ``chan_feed`` InfluxDB database unless the server already lists it."""
    existing_names = (entry["name"] for entry in client.get_list_database())
    if "chan_feed" not in existing_names:
        client.create_database("chan_feed")
| 
 | ||||
| 
 | ||||
def log(event):
    """Forward *event* to ``client.write_points``.

    Callers in this project pass a list of point dicts with
    ``measurement``, ``time`` and ``fields`` keys.
    """
    points = event
    client.write_points(points)
							
								
								
									
										45
									
								
								post_process.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								post_process.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,45 @@ | ||||
| import re | ||||
| 
 | ||||
| LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))") | ||||
| 
 | ||||
| 
 | ||||
def post_process(thing, board, helper):
    """Annotate a raw post dict in place and return it.

    Adds ``v`` (schema version), ``board``, ``chan`` (``helper.db_id``) and
    ``urls``. ``urls`` holds the links found in the comment (``com``) or,
    when the comment is missing or empty, in the subject (``sub``), plus the
    attachment URL when the post reports a non-zero ``fsize``.
    """
    thing["v"] = 1.0
    thing["board"] = board
    thing["chan"] = helper.db_id

    # The comment wins over the subject; the subject is only a fallback
    text = thing.get("com") or thing.get("sub")
    if text:
        thing["urls"] = get_links_from_body(text)

    if thing.get("fsize"):
        attachment = helper.image_url(board, thing["tim"], thing["ext"])
        thing.setdefault("urls", []).append(attachment)

    # Guarantee the key exists even when nothing was found
    thing.setdefault("urls", [])
    return thing
| 
 | ||||
| 
 | ||||
def get_links_from_body(body):
    """Return the distinct external URLs that ``LINK_RE`` finds in *body*.

    Before matching, ``<wbr>`` and ``</s>`` tags are removed and the
    obfuscation `` dot `` is turned back into ``.``. The result is a list
    built from a set, so its order is not meaningful.
    """
    cleaned = body
    for needle, substitute in (("<wbr>", ""), ("</s>", ""), (" dot ", ".")):
        cleaned = cleaned.replace(needle, substitute)

    candidates = (found.group(1) for found in LINK_RE.finditer(cleaned))
    return list({link for link in candidates if is_external(link)})
| 
 | ||||
| 
 | ||||
def is_external(url):
    """Return False for in-page anchors (``#…``) and site-relative paths (``/…``), True otherwise."""
    internal_prefixes = ("#", "/")
    if url.startswith(internal_prefixes):
        return False
    return True
							
								
								
									
										195
									
								
								run.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										195
									
								
								run.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,195 @@ | ||||
| import datetime | ||||
| import json | ||||
| import sqlite3 | ||||
| import sys | ||||
| import traceback | ||||
| from datetime import datetime | ||||
| from queue import Queue | ||||
| from threading import Thread | ||||
| 
 | ||||
| import pika | ||||
| 
 | ||||
| import monitoring | ||||
| from chan import CHANS | ||||
| from post_process import post_process | ||||
| from util import logger, Web | ||||
| 
 | ||||
| MONITORING = True | ||||
| 
 | ||||
| 
 | ||||
class ChanScanner:
    """Walk every board of one chan and yield the posts of changed threads.

    HTTP access goes through ``self.web``, URL construction through
    ``self.helper`` and "has this thread changed?" bookkeeping through
    ``self.state``.
    """

    def __init__(self, helper):
        self.web = Web(monitoring if MONITORING else None)
        self.helper = helper
        self.state = ChanState()

    def _fetch_threads(self, board):
        """Return the decoded thread listing for *board*, or ``[]`` on a non-200 reply."""
        response = self.web.get(self.helper.threads_url(board))
        return response.json() if response.status_code == 200 else []

    def _fetch_posts(self, board, thread):
        """Return the decoded thread document, or an empty ``posts`` list on a non-200 reply."""
        response = self.web.get(self.helper.posts_url(board, thread))
        if response.status_code != 200:
            return {"posts": []}
        return response.json()

    def _threads(self, board):
        """Yield every thread entry from every page of the board listing."""
        for page in self._fetch_threads(board):
            yield from page["threads"]

    def _posts(self, board):
        """Yield posts, ordered by ``no``, of each thread that has new activity.

        Threads are handled in ascending ``no`` order and recorded as visited
        only after all their posts have been yielded.
        """
        def by_number(entry):
            return entry["no"]

        for thread in sorted(self._threads(board), key=by_number):
            if not self.state.has_new_posts(thread, self.helper):
                continue
            thread_posts = self._fetch_posts(board, thread["no"])["posts"]
            yield from sorted(thread_posts, key=by_number)
            self.state.mark_thread_as_visited(thread, self.helper)

    def all_posts(self):
        """Yield ``(post, board)`` pairs for every board of the helper, in board order."""
        for board in self.helper.boards:
            yield from ((post, board) for post in self._posts(board))
| 
 | ||||
| 
 | ||||
def once(func):
    """Decorator: run ``func(item, board, helper)`` only for items not yet recorded.

    Uses the module-level ``state`` object; the item is recorded as visited
    only after ``func`` returned without raising. The wrapper itself always
    returns ``None``.
    """
    def wrapper(item, board, helper):
        if state.has_visited(item["no"], helper):
            return
        func(item, board, helper)
        state.mark_visited(item["no"], helper)

    return wrapper
| 
 | ||||
| 
 | ||||
class ChanState:
    """SQLite-backed memory of which posts and threads were already processed.

    Every operation opens its own short-lived connection and closes it
    explicitly: ``with sqlite3.connect(...)`` only commits or rolls back, it
    never closes the connection.

    :param db_path: path of the SQLite file; defaults to ``state.db`` in the
        current working directory, as before.
    """

    def __init__(self, db_path="state.db"):
        self._db = db_path

        conn = sqlite3.connect(self._db)
        try:
            with conn:
                conn.execute(
                    "CREATE TABLE IF NOT EXISTS posts "
                    "("
                    "   post INT,"
                    "   ts INT DEFAULT (strftime('%s','now')),"
                    "   chan INT,"
                    "   PRIMARY KEY(post, chan)"
                    ")"
                )
                conn.execute(
                    "CREATE TABLE IF NOT EXISTS threads "
                    "("
                    "   thread INT,"
                    "   last_modified INT,"
                    "   ts INT DEFAULT (strftime('%s','now')),"
                    "   chan INT,"
                    "   PRIMARY KEY(thread, chan)"
                    ")"
                )
                conn.execute("PRAGMA journal_mode=wal")
        finally:
            conn.close()

    def _execute(self, sql, params=()):
        """Run one statement on a fresh connection and return its first row (or None).

        The statement is committed on success, rolled back on error, and the
        connection is closed in both cases.
        """
        conn = sqlite3.connect(self._db)
        try:
            with conn:
                return conn.execute(sql, params).fetchone()
        finally:
            conn.close()

    def mark_visited(self, item: int, helper):
        """Record post number *item* as processed for ``helper.db_id``.

        Raises ``sqlite3.IntegrityError`` if the pair is already recorded.
        """
        self._execute(
            "INSERT INTO posts (post, chan) VALUES (?,?)",
            (item, helper.db_id)
        )

    def has_visited(self, item: int, helper):
        """Return True when post number *item* is already recorded for ``helper.db_id``."""
        row = self._execute(
            "SELECT post FROM posts WHERE post=? AND chan=?",
            (item, helper.db_id)
        )
        return row is not None

    def has_new_posts(self, thread, helper):
        """Return True when *thread* is unknown or its ``last_modified`` differs from the stored one."""
        row = self._execute(
            "SELECT last_modified FROM threads WHERE thread=? AND chan=?",
            (thread["no"], helper.db_id)
        )
        return row is None or thread["last_modified"] != row[0]

    def mark_thread_as_visited(self, thread, helper):
        """Insert the thread's ``last_modified``, or update it if the thread is already stored."""
        self._execute(
            "INSERT INTO threads (thread, last_modified, chan) "
            "VALUES (?,?,?) "
            "ON CONFLICT (thread, chan) "
            "DO UPDATE SET last_modified=?",
            (thread["no"], thread["last_modified"], helper.db_id,
             thread["last_modified"])
        )
| 
 | ||||
| 
 | ||||
def publish_worker(queue: Queue, helper):
    """Consume ``(item, board)`` pairs from *queue* forever and publish each one.

    Any exception is logged together with its traceback and the loop keeps
    going; the queue entry is acknowledged with ``task_done()`` either way.
    """
    while True:
        try:
            entry = queue.get()
            item, board = entry
            publish(item, board, helper)
        except Exception as err:
            logger.error("%s: %s" % (err, traceback.format_exc()))
        finally:
            queue.task_done()
| 
 | ||||
| 
 | ||||
@once
def publish(item, board, helper):
    """Post-process *item* and publish it as JSON on the ``chan`` exchange.

    The routing key is ``<chan id>.<thread|post>.<board>``; an item counts as
    a thread when it carries a ``sub`` key. When ``MONITORING`` is enabled
    the delay between the post's ``time`` and now is logged as ``distance``
    (in seconds).
    """
    item_type = "thread" if "sub" in item else "post"
    post_process(item, board, helper)

    chan_channel.basic_publish(
        exchange='chan',
        routing_key="%d.%s.%s" % (helper.db_id, item_type, board),
        body=json.dumps(item)
    )

    if MONITORING:
        now = datetime.utcnow()
        # item["time"] is treated as epoch seconds here. Convert it in UTC as
        # well: fromtimestamp() would yield local time and skew the distance
        # by the host's UTC offset.
        distance = now - datetime.utcfromtimestamp(item["time"])
        monitoring.log([{
            "measurement": helper.db_id,
            "time": str(now),
            "tags": {
                "board": board
            },
            "fields": {
                "distance": distance.total_seconds()
            }
        }])
| 
 | ||||
| 
 | ||||
# Entry point: python run.py <rabbitmq host> <chan name>
if __name__ == "__main__":

    if len(sys.argv) < 3:
        logger.error("You must specify RabbitMQ host & chan!")
        # sys.exit instead of quit(): quit is injected by the site module and
        # is not guaranteed to exist in every runtime.
        sys.exit(1)

    rabbitmq_host = sys.argv[1]
    chan = sys.argv[2]
    if chan not in CHANS:
        logger.error("Unknown chan '%s', expected one of: %s" % (chan, ", ".join(sorted(CHANS))))
        sys.exit(1)
    chan_helper = CHANS[chan]

    if MONITORING:
        monitoring.init()
    # Module-level on purpose: the @once decorator reads `state`
    state = ChanState()

    # Connect before starting the worker: the worker thread is non-daemon, so
    # a failed connection after thread start would leave the process hanging.
    # `chan_channel` is module-level because publish() reads it.
    rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
    chan_channel = rabbit.channel()
    chan_channel.exchange_declare(exchange="chan", exchange_type="topic")

    publish_q = Queue()
    publish_thread = Thread(target=publish_worker, args=(publish_q, chan_helper))
    publish_thread.start()

    s = ChanScanner(chan_helper)
    # Scan all boards over and over; pacing comes from the rate-limited Web.get
    while True:
        for p, b in s.all_posts():
            publish_q.put((p, b))
							
								
								
									
										60
									
								
								util.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										60
									
								
								util.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,60 @@ | ||||
| import logging | ||||
| import sys | ||||
| import time | ||||
| from datetime import datetime | ||||
| from logging import FileHandler, StreamHandler | ||||
| 
 | ||||
| import requests | ||||
| 
 | ||||
# Timestamp of the most recent call of each rate-limited function, keyed by
# the undecorated function object (filled in by rate_limit).
last_time_called = dict()

# Shared logger: DEBUG and above go to chan_feed.log (formatted) and to
# stdout (default format).
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("chan_feed.log")
file_handler.setFormatter(formatter)
# Iterate over a copy: removeHandler() mutates logger.handlers, and removing
# while iterating the live list skips every other handler.
for h in list(logger.handlers):
    logger.removeHandler(h)
logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))
| 
 | ||||
| 
 | ||||
def rate_limit(per_second):
    """Return a decorator limiting a function to *per_second* calls per second.

    Consecutive calls are spaced at least ``1 / per_second`` seconds apart by
    sleeping in the calling thread. The time of the last call is kept in the
    module-level ``last_time_called`` dict under the undecorated function, so
    a decorated method shares one limit across all instances.

    :param per_second: maximum call rate; may be fractional (0.5 = one call
        every two seconds). Zero raises ``ZeroDivisionError``.
    """
    import functools

    min_interval = 1.0 / float(per_second)

    def decorate(func):
        # perf_counter() has an undefined reference point, so "0" is not a
        # valid "never called" marker; -inf guarantees the first call never waits.
        last_time_called[func] = float("-inf")

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.perf_counter() - last_time_called[func]
            wait_time = min_interval - elapsed
            if wait_time > 0:
                time.sleep(wait_time)

            # Stamp before the call so a slow call counts towards the interval
            last_time_called[func] = time.perf_counter()
            return func(*args, **kwargs)

        return wrapper

    return decorate
| 
 | ||||
| 
 | ||||
class Web:
    """Rate-limited HTTP GET helper built on one ``requests.Session``.

    *monitoring* is either a falsy value (no metrics) or an object exposing
    ``log(points)``; when set, the status code of every response is reported
    under the ``web`` measurement.
    """

    def __init__(self, monitoring):
        self.session = requests.Session()
        self.monitoring = monitoring

    @rate_limit(1 / 2)  # TODO: per chan rate limit?
    def get(self, url, **kwargs):
        """GET *url* (extra keyword arguments go to ``Session.get``) and return the response."""
        response = self.session.get(url, **kwargs)
        status = response.status_code
        logger.debug("GET %s <%d>" % (url, status))

        if not self.monitoring:
            return response

        point = {
            "measurement": "web",
            "time": str(datetime.utcnow()),
            "fields": {
                "status_code": status
            }
        }
        self.monitoring.log([point])
        return response
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user