post_process tweak

This commit is contained in:
simon987 2019-08-15 17:07:46 -04:00
parent 342397b93e
commit 2e222774c1
2 changed files with 27 additions and 5 deletions

View File

@ -1,6 +1,7 @@
import re
LINK_RE = re.compile(r'href="([^"]+)"')
# Captures the value of every double-quoted href="..." attribute in an HTML body.
HTML_LINK_RE = re.compile(r'href="([^"]+)"')
# Captures the target of a Markdown inline link, "[text](target)".
# The link text is matched lazily: a greedy ".*" would run from the first "["
# to the last "]" on the line and keep only the final link's target.
LINK_RE = re.compile(r'\[.*?\]\(([^)]+)\)')
# Matches URLs on reddit's own hosts; group 1 is the host, group 2 is the
# path after the first "/". Dots are escaped so they only match a literal ".".
INTERNAL_RE = re.compile(r"^https?://(reddit\.com|redd\.it|old\.reddit\.com|www\.reddit\.com|np\.reddit\.com)/(.*)")
@ -8,10 +9,19 @@ def post_process(thing):
thing["v"] = 1.0
thing["urls"] = []
if "body_html" in thing and thing["body_html"]:
thing["urls"].extend(get_links_from_body(thing["body_html"]))
elif "selftext_html" in thing and thing["selftext_html"]:
thing["urls"].extend(get_links_from_body(thing["selftext_html"]))
thing["urls"].extend(get_links_from_body_html(thing["body_html"]))
elif "body" in thing and thing["body"]:
thing["urls"].extend(get_links_from_body(thing["body"]))
if "selftext_html" in thing and thing["selftext_html"]:
thing["urls"].extend(get_links_from_body_html(thing["selftext_html"]))
elif "selftext" in thing and thing["selftext"]:
thing["urls"].extend(get_links_from_body(thing["selftext"]))
if thing["urls"]:
print(thing["urls"])
if "url" in thing and thing["url"] and is_external(thing["url"]):
thing["urls"].append(thing["url"])
@ -19,9 +29,21 @@ def post_process(thing):
return thing
def get_links_from_body_html(body):
    """Return the distinct external URLs found in href attributes of *body*.

    Every HTML_LINK_RE match contributes its first capture group; only
    values accepted by is_external() are kept. Duplicates are collapsed
    through a set, so the order of the returned list is not guaranteed.
    """
    hrefs = (found.group(1) for found in HTML_LINK_RE.finditer(body))
    external = {href for href in hrefs if is_external(href)}
    return list(external)
def get_links_from_body(body):
result = set()
body = body.replace("\\)", "(")
for match in LINK_RE.finditer(body):
url = match.group(1)
if is_external(url):

2
run.py
View File

@ -93,7 +93,7 @@ def serialize(thing):
"permalink": thing.permalink,
"pinned": thing.pinned,
"score": thing.score,
"selftext": thing.selftext,
"selftext": thing.selftext if hasattr(thing, "selftext") else None,
"selftext_html": thing.selftext_html if hasattr(thing, "selftext_html") else None,
"spoiler": thing.spoiler,
"stickied": thing.stickied,