mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-18 17:56:46 +00:00
Bug fixes, HTML boards WIP
This commit is contained in:
parent
50102ca4c8
commit
cfaa9d92c9
159
chan.py
159
chan.py
@ -1,4 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from post_process import get_links_from_body
|
from post_process import get_links_from_body
|
||||||
|
|
||||||
@ -21,6 +24,95 @@ class ChanHelper:
|
|||||||
def posts_url(self, board, thread):
|
def posts_url(self, board, thread):
|
||||||
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
|
return "%s%s%s%d.json" % (self._base_url, board, self._thread_path, thread)
|
||||||
|
|
||||||
|
def board_hash(self, board):
|
||||||
|
return str((self.boards.index(board) + 1) * 10000)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_id(item):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def item_unique_id(self, item, board):
|
||||||
|
return int(self.board_hash(board) + str(self.item_id(item)))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def thread_mtime(thread):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def item_urls(self, item, board):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_type(item):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_threads_list(content):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_thread(r):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlChanHelper(ChanHelper):
|
||||||
|
|
||||||
|
def threads_url(self, board):
|
||||||
|
return "%s%s/" % (self._base_url, board)
|
||||||
|
|
||||||
|
def posts_url(self, board, thread):
|
||||||
|
return "%s%s%s%d.html" % (self._base_url, board, self._thread_path, thread)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_id(item):
|
||||||
|
return item["id"]
|
||||||
|
|
||||||
|
def item_urls(self, item, board):
|
||||||
|
return []
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def item_type(item):
|
||||||
|
return item["type"]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def thread_mtime(thread):
|
||||||
|
return -1
|
||||||
|
|
||||||
|
def parse_threads_list(self, r):
|
||||||
|
soup = BeautifulSoup(r.text, "html.parser")
|
||||||
|
|
||||||
|
threads = []
|
||||||
|
|
||||||
|
for threadEl in soup.find_all("div", attrs={"class": "opCell"}):
|
||||||
|
threads.append({
|
||||||
|
"id": int(threadEl.get("id")),
|
||||||
|
})
|
||||||
|
|
||||||
|
next_url = soup.find("a", attrs={"id": "linkNext"})
|
||||||
|
if next_url:
|
||||||
|
return threads, urljoin(r.url, next_url.get("href"))
|
||||||
|
return threads, None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_thread(r):
|
||||||
|
soup = BeautifulSoup(r.text, "html.parser")
|
||||||
|
|
||||||
|
op_el = soup.find("div", attrs={"class": "innerOP"})
|
||||||
|
yield {
|
||||||
|
"id": int(soup.find("div", attrs={"class": "opCell"}).get("id")),
|
||||||
|
"type": "thread",
|
||||||
|
"html": str(op_el),
|
||||||
|
}
|
||||||
|
|
||||||
|
for post_el in soup.find_all("div", attrs={"class": "postCell"}):
|
||||||
|
yield {
|
||||||
|
"id": int(post_el.get("id")),
|
||||||
|
"type": "post",
|
||||||
|
"html": str(post_el),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class JsonChanHelper(ChanHelper):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def item_id(item):
|
def item_id(item):
|
||||||
return item["no"]
|
return item["no"]
|
||||||
@ -46,32 +138,34 @@ class ChanHelper:
|
|||||||
return thread["last_modified"]
|
return thread["last_modified"]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_threads_list(content):
|
def parse_threads_list(r):
|
||||||
j = json.loads(content)
|
j = json.loads(r.text)
|
||||||
|
threads = []
|
||||||
for page in j:
|
for page in j:
|
||||||
for thread in page["threads"]:
|
for thread in page["threads"]:
|
||||||
yield thread
|
threads.append(thread)
|
||||||
|
return threads, None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_thread(content):
|
def parse_thread(r):
|
||||||
j = json.loads(content)
|
j = json.loads(r.text)
|
||||||
return j["posts"]
|
return j["posts"]
|
||||||
|
|
||||||
|
|
||||||
class RussianChanHelper(ChanHelper):
|
class RussianJsonChanHelper(ChanHelper):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def item_id(item):
|
def item_id(item):
|
||||||
return int(item["num"])
|
return int(item["num"])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_threads_list(content):
|
def parse_threads_list(r):
|
||||||
j = json.loads(content)
|
j = json.loads(r.text)
|
||||||
return j["threads"]
|
return j["threads"], None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_thread(content):
|
def parse_thread(r):
|
||||||
j = json.loads(content)
|
j = json.loads(r.text)
|
||||||
for thread in j["threads"]:
|
for thread in j["threads"]:
|
||||||
for post in thread["posts"]:
|
for post in thread["posts"]:
|
||||||
yield post
|
yield post
|
||||||
@ -92,9 +186,6 @@ class RussianChanHelper(ChanHelper):
|
|||||||
elif "subject" in item and item["subject"]:
|
elif "subject" in item and item["subject"]:
|
||||||
urls.update(get_links_from_body(item["subject"]))
|
urls.update(get_links_from_body(item["subject"]))
|
||||||
|
|
||||||
if urls:
|
|
||||||
print(list(urls))
|
|
||||||
|
|
||||||
for file in item["files"]:
|
for file in item["files"]:
|
||||||
urls.add(self._base_url + file["path"])
|
urls.add(self._base_url + file["path"])
|
||||||
|
|
||||||
@ -102,7 +193,7 @@ class RussianChanHelper(ChanHelper):
|
|||||||
|
|
||||||
|
|
||||||
CHANS = {
|
CHANS = {
|
||||||
"4chan": ChanHelper(
|
"4chan": JsonChanHelper(
|
||||||
1,
|
1,
|
||||||
"https://a.4cdn.org/",
|
"https://a.4cdn.org/",
|
||||||
"https://i.4cdn.org/",
|
"https://i.4cdn.org/",
|
||||||
@ -119,7 +210,7 @@ CHANS = {
|
|||||||
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
|
"tg", "toy", "trv", "tv", "vp", "wsg", "wsr", "x"
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"lainchan": ChanHelper(
|
"lainchan": JsonChanHelper(
|
||||||
2,
|
2,
|
||||||
"https://lainchan.org/",
|
"https://lainchan.org/",
|
||||||
"https://lainchan.org/",
|
"https://lainchan.org/",
|
||||||
@ -127,11 +218,11 @@ CHANS = {
|
|||||||
"/src/",
|
"/src/",
|
||||||
[
|
[
|
||||||
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
|
"λ", "diy", "sec", "tech", "inter", "lit", "music", "vis",
|
||||||
"hum", "drg", "zzz", "layer" "q", "r", "cult", "psy",
|
"hum", "drg", "zzz", "layer", "q", "r", "cult", "psy",
|
||||||
"mega", "random"
|
"mega",
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"uboachan": ChanHelper(
|
"uboachan": JsonChanHelper(
|
||||||
3,
|
3,
|
||||||
"https://uboachan.net/",
|
"https://uboachan.net/",
|
||||||
"https://uboachan.net/",
|
"https://uboachan.net/",
|
||||||
@ -142,7 +233,7 @@ CHANS = {
|
|||||||
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
|
"ig", "2", "ot", "hikki", "cc", "x", "sugg"
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"22chan": ChanHelper(
|
"22chan": JsonChanHelper(
|
||||||
4,
|
4,
|
||||||
"https://22chan.org/",
|
"https://22chan.org/",
|
||||||
"https://22chan.org/",
|
"https://22chan.org/",
|
||||||
@ -153,7 +244,7 @@ CHANS = {
|
|||||||
"sg", "t", "vg"
|
"sg", "t", "vg"
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"wizchan": ChanHelper(
|
"wizchan": JsonChanHelper(
|
||||||
5,
|
5,
|
||||||
"https://wizchan.org/",
|
"https://wizchan.org/",
|
||||||
"https://wizchan.org/",
|
"https://wizchan.org/",
|
||||||
@ -163,7 +254,18 @@ CHANS = {
|
|||||||
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
|
"wiz", "dep", "hob", "lounge", "jp", "meta", "games", "music",
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"2chhk": RussianChanHelper(
|
# TODO
|
||||||
|
"1chan": ChanHelper(
|
||||||
|
6,
|
||||||
|
"https://www.1chan.net/",
|
||||||
|
"https://www.1chan.net/",
|
||||||
|
"/res/",
|
||||||
|
"/src/",
|
||||||
|
[
|
||||||
|
"rails"
|
||||||
|
],
|
||||||
|
),
|
||||||
|
"2chhk": RussianJsonChanHelper(
|
||||||
7,
|
7,
|
||||||
"https://2ch.hk/",
|
"https://2ch.hk/",
|
||||||
"https://2ch.hk/",
|
"https://2ch.hk/",
|
||||||
@ -181,5 +283,16 @@ CHANS = {
|
|||||||
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
|
"a", "fd", "ja", "ma", "vn", "fg", "fur", "gg", "ga",
|
||||||
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
|
"vape", "h", "ho", "hc", "e", "fet", "sex", "fag"
|
||||||
],
|
],
|
||||||
)
|
),
|
||||||
|
# TODO
|
||||||
|
"endchan": HtmlChanHelper(
|
||||||
|
8,
|
||||||
|
"https://endchan.net/",
|
||||||
|
"https://endchan.net/",
|
||||||
|
"/res/",
|
||||||
|
"/.media/",
|
||||||
|
[
|
||||||
|
"yuri"
|
||||||
|
],
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
@ -4,8 +4,8 @@ LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
|||||||
|
|
||||||
|
|
||||||
def post_process(item, board, helper):
|
def post_process(item, board, helper):
|
||||||
item["_v"] = 1.2
|
item["_v"] = 1.3
|
||||||
item["_id"] = helper.item_id(item)
|
item["_id"] = helper.item_unique_id(item, board)
|
||||||
|
|
||||||
item["_board"] = board
|
item["_board"] = board
|
||||||
item["_chan"] = helper.db_id
|
item["_chan"] = helper.db_id
|
||||||
|
@ -3,3 +3,4 @@ requests[socks]
|
|||||||
stem
|
stem
|
||||||
influxdb
|
influxdb
|
||||||
pika
|
pika
|
||||||
|
bs4
|
77
run.py
77
run.py
@ -25,22 +25,31 @@ class ChanScanner:
|
|||||||
|
|
||||||
def _threads(self, board):
|
def _threads(self, board):
|
||||||
r = self.web.get(self.helper.threads_url(board))
|
r = self.web.get(self.helper.threads_url(board))
|
||||||
if r.status_code == 200:
|
if r.status_code != 200:
|
||||||
return self.helper.parse_threads_list(r.text)
|
return []
|
||||||
return []
|
|
||||||
|
while True:
|
||||||
|
threads, next_url = self.helper.parse_threads_list(r)
|
||||||
|
for thread in threads:
|
||||||
|
yield thread
|
||||||
|
if not next_url:
|
||||||
|
break
|
||||||
|
r = self.web.get(next_url)
|
||||||
|
if r.status_code != 200:
|
||||||
|
break
|
||||||
|
|
||||||
def _fetch_posts(self, board, thread):
|
def _fetch_posts(self, board, thread):
|
||||||
r = self.web.get(self.helper.posts_url(board, thread))
|
r = self.web.get(self.helper.posts_url(board, thread))
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
return self.helper.parse_thread(r.text)
|
return self.helper.parse_thread(r)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def _posts(self, board):
|
def _posts(self, board):
|
||||||
for thread in self._threads(board):
|
for thread in self._threads(board):
|
||||||
if self.state.has_new_posts(thread, self.helper):
|
if self.state.has_new_posts(thread, self.helper, board):
|
||||||
for post in self._fetch_posts(board, self.helper.item_id(thread)):
|
for post in self._fetch_posts(board, self.helper.item_id(thread)):
|
||||||
yield post
|
yield post
|
||||||
self.state.mark_thread_as_visited(thread, self.helper)
|
self.state.mark_thread_as_visited(thread, self.helper, board)
|
||||||
|
|
||||||
def all_posts(self):
|
def all_posts(self):
|
||||||
for board in self.helper.boards:
|
for board in self.helper.boards:
|
||||||
@ -50,9 +59,9 @@ class ChanScanner:
|
|||||||
|
|
||||||
def once(func):
|
def once(func):
|
||||||
def wrapper(item, board, helper):
|
def wrapper(item, board, helper):
|
||||||
if not state.has_visited(helper.item_id(item), helper):
|
if not state.has_visited(helper.item_unique_id(item, board), helper):
|
||||||
func(item, board, helper)
|
func(item, board, helper)
|
||||||
state.mark_visited(helper.item_id(item), helper)
|
state.mark_visited(helper.item_unique_id(item, board), helper)
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
@ -100,26 +109,30 @@ class ChanState:
|
|||||||
)
|
)
|
||||||
return cur.fetchone() is not None
|
return cur.fetchone() is not None
|
||||||
|
|
||||||
def has_new_posts(self, thread, helper):
|
def has_new_posts(self, thread, helper, board):
|
||||||
|
mtime = helper.thread_mtime(thread)
|
||||||
|
if mtime == -1:
|
||||||
|
return True
|
||||||
|
|
||||||
with sqlite3.connect(self._db, timeout=5000) as conn:
|
with sqlite3.connect(self._db, timeout=5000) as conn:
|
||||||
cur = conn.cursor()
|
cur = conn.cursor()
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"SELECT last_modified FROM threads WHERE thread=? AND chan=?",
|
"SELECT last_modified FROM threads WHERE thread=? AND chan=?",
|
||||||
(helper.item_id(thread), helper.db_id)
|
(helper.item_unique_id(thread, board), helper.db_id)
|
||||||
)
|
)
|
||||||
row = cur.fetchone()
|
row = cur.fetchone()
|
||||||
if not row or helper.thread_mtime(thread) != row[0]:
|
if not row or helper.thread_mtime(thread) != row[0]:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def mark_thread_as_visited(self, thread, helper):
|
def mark_thread_as_visited(self, thread, helper, board):
|
||||||
with sqlite3.connect(self._db, timeout=5000) as conn:
|
with sqlite3.connect(self._db, timeout=5000) as conn:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO threads (thread, last_modified, chan) "
|
"INSERT INTO threads (thread, last_modified, chan) "
|
||||||
"VALUES (?,?,?) "
|
"VALUES (?,?,?) "
|
||||||
"ON CONFLICT (thread, chan) "
|
"ON CONFLICT (thread, chan) "
|
||||||
"DO UPDATE SET last_modified=?",
|
"DO UPDATE SET last_modified=?",
|
||||||
(helper.item_id(thread), helper.thread_mtime(thread), helper.db_id,
|
(helper.item_unique_id(thread, board), helper.thread_mtime(thread), helper.db_id,
|
||||||
helper.thread_mtime(thread))
|
helper.thread_mtime(thread))
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
@ -142,24 +155,30 @@ def publish(item, board, helper):
|
|||||||
item_type = helper.item_type(item)
|
item_type = helper.item_type(item)
|
||||||
post_process(item, board, helper)
|
post_process(item, board, helper)
|
||||||
|
|
||||||
chan_channel.basic_publish(
|
while True:
|
||||||
exchange='chan',
|
try:
|
||||||
routing_key="%s.%s.%s" % (chan, item_type, board),
|
chan_channel.basic_publish(
|
||||||
body=json.dumps(item)
|
exchange='chan',
|
||||||
)
|
routing_key="%s.%s.%s" % (chan, item_type, board),
|
||||||
|
body=json.dumps(item)
|
||||||
|
)
|
||||||
|
|
||||||
if MONITORING:
|
if MONITORING:
|
||||||
distance = datetime.utcnow() - datetime.fromtimestamp(helper.item_mtime(item))
|
distance = datetime.utcnow() - datetime.fromtimestamp(helper.item_mtime(item))
|
||||||
monitoring.log([{
|
monitoring.log([{
|
||||||
"measurement": chan,
|
"measurement": chan,
|
||||||
"time": str(datetime.utcnow()),
|
"time": str(datetime.utcnow()),
|
||||||
"tags": {
|
"tags": {
|
||||||
"board": board
|
"board": board
|
||||||
},
|
},
|
||||||
"fields": {
|
"fields": {
|
||||||
"distance": distance.total_seconds()
|
"distance": distance.total_seconds()
|
||||||
}
|
}
|
||||||
}])
|
}])
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(traceback.format_exc())
|
||||||
|
logger.error(str(e))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
x
Reference in New Issue
Block a user