mirror of
https://github.com/simon987/reddit_feed.git
synced 2025-04-24 12:15:51 +00:00
Remove duplicate URLs in post-processing
This commit is contained in:
parent
6424506e06
commit
6f4b19538b
@ -6,22 +6,24 @@ INTERNAL_RE = re.compile(r"^https?://(reddit.com|redd.it|old.reddit.com|www.redd
|
|||||||
|
|
||||||
|
|
||||||
def post_process(thing):
|
def post_process(thing):
|
||||||
thing["v"] = 1.1
|
thing["v"] = 1.2
|
||||||
|
|
||||||
thing["urls"] = []
|
urls = set()
|
||||||
|
|
||||||
if "body_html" in thing and thing["body_html"]:
|
if "body_html" in thing and thing["body_html"]:
|
||||||
thing["urls"].extend(get_links_from_body_html(thing["body_html"]))
|
urls.update(get_links_from_body_html(thing["body_html"]))
|
||||||
elif "body" in thing and thing["body"]:
|
elif "body" in thing and thing["body"]:
|
||||||
thing["urls"].extend(get_links_from_body(thing["body"]))
|
urls.update(get_links_from_body(thing["body"]))
|
||||||
|
|
||||||
if "selftext_html" in thing and thing["selftext_html"]:
|
if "selftext_html" in thing and thing["selftext_html"]:
|
||||||
thing["urls"].extend(get_links_from_body_html(thing["selftext_html"]))
|
urls.update(get_links_from_body_html(thing["selftext_html"]))
|
||||||
elif "selftext" in thing and thing["selftext"]:
|
elif "selftext" in thing and thing["selftext"]:
|
||||||
thing["urls"].extend(get_links_from_body(thing["selftext"]))
|
urls.update(get_links_from_body(thing["selftext"]))
|
||||||
|
|
||||||
if "url" in thing and thing["url"] and is_external(thing["url"]):
|
if "url" in thing and thing["url"] and is_external(thing["url"]):
|
||||||
thing["urls"].append(thing["url"])
|
urls.add(thing["url"])
|
||||||
|
|
||||||
|
thing["urls"] = list(urls)
|
||||||
|
|
||||||
return thing
|
return thing
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user