From 2e222774c1d018b3c7c09a4ffb7e649f8912573c Mon Sep 17 00:00:00 2001 From: simon987 Date: Thu, 15 Aug 2019 17:07:46 -0400 Subject: [PATCH] post_process tweak --- post_process.py | 30 ++++++++++++++++++++++++++---- run.py | 2 +- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/post_process.py b/post_process.py index 6e4d4f5..592e0d8 100644 --- a/post_process.py +++ b/post_process.py @@ -1,6 +1,7 @@ import re -LINK_RE = re.compile(r'href="([^"]+)"') +HTML_LINK_RE = re.compile(r'href="([^"]+)"') +LINK_RE = re.compile(r'\[.*\]\(([^)]+)\)') INTERNAL_RE = re.compile(r"^https?://(reddit.com|redd.it|old.reddit.com|www.reddit.com|np.reddit.com)/(.*)") @@ -8,10 +9,19 @@ def post_process(thing): thing["v"] = 1.0 thing["urls"] = [] + if "body_html" in thing and thing["body_html"]: - thing["urls"].extend(get_links_from_body(thing["body_html"])) - elif "selftext_html" in thing and thing["selftext_html"]: - thing["urls"].extend(get_links_from_body(thing["selftext_html"])) + thing["urls"].extend(get_links_from_body_html(thing["body_html"])) + elif "body" in thing and thing["body"]: + thing["urls"].extend(get_links_from_body(thing["body"])) + + if "selftext_html" in thing and thing["selftext_html"]: + thing["urls"].extend(get_links_from_body_html(thing["selftext_html"])) + elif "selftext" in thing and thing["selftext"]: + thing["urls"].extend(get_links_from_body(thing["selftext"])) + + if thing["urls"]: + print(thing["urls"]) if "url" in thing and thing["url"] and is_external(thing["url"]): thing["urls"].append(thing["url"]) @@ -19,9 +29,21 @@ def post_process(thing): return thing +def get_links_from_body_html(body): + result = set() + + for match in HTML_LINK_RE.finditer(body): + url = match.group(1) + if is_external(url): + result.add(url) + + return list(result) + + def get_links_from_body(body): result = set() + body = body.replace("\\)", "(") for match in LINK_RE.finditer(body): url = match.group(1) if is_external(url): diff --git a/run.py b/run.py index dc99cc9..c6d381f 100755 --- a/run.py +++ b/run.py @@ -93,7 +93,7 @@ def serialize(thing): "permalink": thing.permalink, "pinned": thing.pinned, "score": thing.score, - "selftext": thing.selftext, + "selftext": thing.selftext if hasattr(thing, "selftext") else None, "selftext_html": thing.selftext_html if hasattr(thing, "selftext_html") else None, "spoiler": thing.spoiler, "stickied": thing.stickied,