From 6f4b19538bffdb83b1a0ae103a069e9a4a436de9 Mon Sep 17 00:00:00 2001
From: simon987
Date: Tue, 27 Aug 2019 17:28:51 -0400
Subject: [PATCH] Remove duplicate urls in post processing

---
 post_process.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/post_process.py b/post_process.py
index c23ad93..43bbe07 100644
--- a/post_process.py
+++ b/post_process.py
@@ -6,22 +6,24 @@ INTERNAL_RE = re.compile(r"^https?://(reddit.com|redd.it|old.reddit.com|www.redd
 
 
 def post_process(thing):
-    thing["v"] = 1.1
+    thing["v"] = 1.2
 
-    thing["urls"] = []
+    urls = set()
     if "body_html" in thing and thing["body_html"]:
-        thing["urls"].extend(get_links_from_body_html(thing["body_html"]))
+        urls.update(get_links_from_body_html(thing["body_html"]))
     elif "body" in thing and thing["body"]:
-        thing["urls"].extend(get_links_from_body(thing["body"]))
+        urls.update(get_links_from_body(thing["body"]))
 
     if "selftext_html" in thing and thing["selftext_html"]:
-        thing["urls"].extend(get_links_from_body_html(thing["selftext_html"]))
+        urls.update(get_links_from_body_html(thing["selftext_html"]))
     elif "selftext" in thing and thing["selftext"]:
-        thing["urls"].extend(get_links_from_body(thing["selftext"]))
+        urls.update(get_links_from_body(thing["selftext"]))
 
     if "url" in thing and thing["url"] and is_external(thing["url"]):
-        thing["urls"].append(thing["url"])
+        urls.add(thing["url"])
+
+    thing["urls"] = list(urls)
 
     return thing
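
For reference, below is a minimal, self-contained sketch of the behaviour this patch introduces: links are collected into a set so that repeated URLs collapse to a single entry before the final list is stored on the thing. The get_links_from_body, is_external and INTERNAL_RE definitions in the sketch are simplified stand-ins for the real helpers defined elsewhere in post_process.py, included only to make the example runnable.

# Minimal sketch of the deduplication introduced by this patch (not the full
# post_process.py). The helpers below are simplified stand-ins, not the
# repo's actual implementations.
import re

URL_RE = re.compile(r"https?://[^\s)\]]+")
INTERNAL_RE = re.compile(r"^https?://(reddit\.com|redd\.it|old\.reddit\.com|www\.reddit\.com)")


def get_links_from_body(body):
    # Stand-in: pull anything URL-shaped out of plain text.
    return URL_RE.findall(body)


def is_external(url):
    return not INTERNAL_RE.match(url)


def post_process(thing):
    thing["v"] = 1.2

    urls = set()  # a set collapses duplicates as links are added
    if "body" in thing and thing["body"]:
        urls.update(get_links_from_body(thing["body"]))

    if "url" in thing and thing["url"] and is_external(thing["url"]):
        urls.add(thing["url"])

    thing["urls"] = list(urls)  # back to a list for serialization

    return thing


thing = {
    "body": "see https://example.com/a and, again, https://example.com/a",
    "url": "https://example.com/a",
}
print(post_process(thing)["urls"])  # ['https://example.com/a'] - one entry, not three

One side effect of this approach: converting the set back to a list does not preserve the order in which the URLs were encountered, which the previous list-based code did.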