mirror of
https://github.com/simon987/reddit_feed.git
synced 2025-04-24 12:15:51 +00:00
post_process tweak
This commit is contained in:
parent
342397b93e
commit
2e222774c1
@ -1,6 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
LINK_RE = re.compile(r'href="([^"]+)"')
|
HTML_LINK_RE = re.compile(r'href="([^"]+)"')
|
||||||
|
LINK_RE = re.compile(r'\[.*\]\(([^)]+)\)')
|
||||||
INTERNAL_RE = re.compile(r"^https?://(reddit.com|redd.it|old.reddit.com|www.reddit.com|np.reddit.com)/(.*)")
|
INTERNAL_RE = re.compile(r"^https?://(reddit.com|redd.it|old.reddit.com|www.reddit.com|np.reddit.com)/(.*)")
|
||||||
|
|
||||||
|
|
||||||
@ -8,10 +9,19 @@ def post_process(thing):
|
|||||||
thing["v"] = 1.0
|
thing["v"] = 1.0
|
||||||
|
|
||||||
thing["urls"] = []
|
thing["urls"] = []
|
||||||
|
|
||||||
if "body_html" in thing and thing["body_html"]:
|
if "body_html" in thing and thing["body_html"]:
|
||||||
thing["urls"].extend(get_links_from_body(thing["body_html"]))
|
thing["urls"].extend(get_links_from_body_html(thing["body_html"]))
|
||||||
elif "selftext_html" in thing and thing["selftext_html"]:
|
elif "body" in thing and thing["body"]:
|
||||||
thing["urls"].extend(get_links_from_body(thing["selftext_html"]))
|
thing["urls"].extend(get_links_from_body(thing["body"]))
|
||||||
|
|
||||||
|
if "selftext_html" in thing and thing["selftext_html"]:
|
||||||
|
thing["urls"].extend(get_links_from_body_html(thing["selftext_html"]))
|
||||||
|
elif "selftext" in thing and thing["selftext"]:
|
||||||
|
thing["urls"].extend(get_links_from_body(thing["selftext"]))
|
||||||
|
|
||||||
|
if thing["urls"]:
|
||||||
|
print(thing["urls"])
|
||||||
|
|
||||||
if "url" in thing and thing["url"] and is_external(thing["url"]):
|
if "url" in thing and thing["url"] and is_external(thing["url"]):
|
||||||
thing["urls"].append(thing["url"])
|
thing["urls"].append(thing["url"])
|
||||||
@ -19,9 +29,21 @@ def post_process(thing):
|
|||||||
return thing
|
return thing
|
||||||
|
|
||||||
|
|
||||||
|
def get_links_from_body_html(body):
|
||||||
|
result = set()
|
||||||
|
|
||||||
|
for match in HTML_LINK_RE.finditer(body):
|
||||||
|
url = match.group(1)
|
||||||
|
if is_external(url):
|
||||||
|
result.add(url)
|
||||||
|
|
||||||
|
return list(result)
|
||||||
|
|
||||||
|
|
||||||
def get_links_from_body(body):
|
def get_links_from_body(body):
|
||||||
result = set()
|
result = set()
|
||||||
|
|
||||||
|
body = body.replace("\\)", "(")
|
||||||
for match in LINK_RE.finditer(body):
|
for match in LINK_RE.finditer(body):
|
||||||
url = match.group(1)
|
url = match.group(1)
|
||||||
if is_external(url):
|
if is_external(url):
|
||||||
|
2
run.py
2
run.py
@ -93,7 +93,7 @@ def serialize(thing):
|
|||||||
"permalink": thing.permalink,
|
"permalink": thing.permalink,
|
||||||
"pinned": thing.pinned,
|
"pinned": thing.pinned,
|
||||||
"score": thing.score,
|
"score": thing.score,
|
||||||
"selftext": thing.selftext,
|
"selftext": thing.selftext if hasattr(thing, "selftext") else None,
|
||||||
"selftext_html": thing.selftext_html if hasattr(thing, "selftext_html") else None,
|
"selftext_html": thing.selftext_html if hasattr(thing, "selftext_html") else None,
|
||||||
"spoiler": thing.spoiler,
|
"spoiler": thing.spoiler,
|
||||||
"stickied": thing.stickied,
|
"stickied": thing.stickied,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user