mirror of
https://github.com/simon987/chan_feed.git
synced 2025-04-19 02:06:42 +00:00
46 lines
1.1 KiB
Python
46 lines
1.1 KiB
Python
import re
|
|
|
|
LINK_RE = re.compile(r"(https?://[\w\-_.]+\.[a-z]{2,4}([^\s<'\"]*|$))")
|
|
|
|
|
|
def post_process(thing, board, helper):
|
|
thing["v"] = 1.0
|
|
|
|
thing["board"] = board
|
|
thing["chan"] = helper.db_id
|
|
|
|
if "com" in thing and thing["com"]:
|
|
thing["urls"] = get_links_from_body(thing["com"])
|
|
elif "sub" in thing and thing["sub"]:
|
|
thing["urls"] = get_links_from_body(thing["sub"])
|
|
if "fsize" in thing and thing["fsize"]:
|
|
url = helper.image_url(board, thing["tim"], thing["ext"])
|
|
if "urls" in thing:
|
|
thing["urls"].append(url)
|
|
else:
|
|
thing["urls"] = [url]
|
|
if "urls" not in thing:
|
|
thing["urls"] = []
|
|
|
|
return thing
|
|
|
|
|
|
def get_links_from_body(body):
|
|
result = set()
|
|
|
|
body = body \
|
|
.replace("<wbr>", "") \
|
|
.replace("</s>", "") \
|
|
.replace(" dot ", ".")
|
|
|
|
for match in LINK_RE.finditer(body):
|
|
url = match.group(1)
|
|
if is_external(url):
|
|
result.add(url)
|
|
|
|
return list(result)
|
|
|
|
|
|
def is_external(url):
|
|
return not url.startswith(("#", "/"))
|