From 2f08467a832e7a59f62558b37144e13f3c0a277e Mon Sep 17 00:00:00 2001 From: simon987 Date: Mon, 12 Aug 2019 09:19:46 -0400 Subject: [PATCH] Add post-process pass --- monitoring.py | 2 +- post_process.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ run.py | 14 ++++++++------ 3 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 post_process.py diff --git a/monitoring.py b/monitoring.py index f21647e..7325d93 100644 --- a/monitoring.py +++ b/monitoring.py @@ -1,6 +1,6 @@ from influxdb import InfluxDBClient -client = InfluxDBClient('localhost', 8086, 'root', 'root', 'reddit_feed') +client = InfluxDBClient("localhost", 8086, "root", "root", "reddit_feed") def init(): diff --git a/post_process.py b/post_process.py new file mode 100644 index 0000000..6e4d4f5 --- /dev/null +++ b/post_process.py @@ -0,0 +1,45 @@ +import re + +LINK_RE = re.compile(r'href="([^"]+)"') +INTERNAL_RE = re.compile(r"^https?://(reddit.com|redd.it|old.reddit.com|www.reddit.com|np.reddit.com)/(.*)") + + +def post_process(thing): + thing["v"] = 1.0 + + thing["urls"] = [] + if "body_html" in thing and thing["body_html"]: + thing["urls"].extend(get_links_from_body(thing["body_html"])) + elif "selftext_html" in thing and thing["selftext_html"]: + thing["urls"].extend(get_links_from_body(thing["selftext_html"])) + + if "url" in thing and thing["url"] and is_external(thing["url"]): + thing["urls"].append(thing["url"]) + + return thing + + +def get_links_from_body(body): + result = set() + + for match in LINK_RE.finditer(body): + url = match.group(1) + if is_external(url): + result.add(url) + + return list(result) + + +def is_external(url): + + if INTERNAL_RE.match(url): + return False + + if "message/compose" in url: + return False + + if url.startswith(("/", "#", "