add retropublish utility script

This commit is contained in:
simon987 2019-08-15 15:30:19 -04:00
parent 7bc68283ea
commit 342397b93e
3 changed files with 38 additions and 32 deletions

View File

@ -1,3 +1,4 @@
praw praw
pika pika
influxdb influxdb
psaw

26
retropublish.py Normal file
View File

@ -0,0 +1,26 @@
# Script to retroactively publish reddit items from pushshift for a specific subreddit
import traceback
import psaw
import sys
from run import publish, logger
if len(sys.argv) != 3:
print("Usage: ./retropublish.py post|comment subreddit")
item_type = sys.argv[1]
subreddit = sys.argv[2]
p = psaw.PushshiftAPI()
if item_type == "post":
gen = p.search_submissions(subreddit=subreddit, limit=100000000)
else:
gen = p.search_comments(subreddit=subreddit, limit=1000000000)
for item in gen:
try:
publish(item)
except Exception as e:
logger.error(str(e) + ": " + traceback.format_exc())

43
run.py
View File

@ -30,6 +30,11 @@ reddit._core._rate_limiter = GoodRateLimiter()
logger = logging.getLogger("default") logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
reddit_channel = rabbit.channel()
reddit_channel.exchange_declare(exchange="reddit", exchange_type="topic")
REALTIME_DELAY = timedelta(seconds=60) REALTIME_DELAY = timedelta(seconds=60)
MONITORING = True MONITORING = True
@ -48,14 +53,11 @@ def serialize(thing):
"author": str(thing.author), "author": str(thing.author),
"author_flair_text": thing.author_flair_text, "author_flair_text": thing.author_flair_text,
"body": thing.body, "body": thing.body,
"body_html": thing.body_html, "selftext_html": thing.body_html if hasattr(thing, "body_html") else None,
"controversiality": thing.controversiality, "controversiality": thing.controversiality,
"created": int(thing.created), "created": int(thing.created),
"created_utc": int(thing.created_utc), "created_utc": int(thing.created_utc),
"downs": thing.downs, "downs": thing.downs if hasattr(thing, "downs") else 0,
"edited": thing.edited,
"gilded": thing.gilded,
"gildings": thing.gildings,
"id": thing.id, "id": thing.id,
"link_id": thing.link_id, "link_id": thing.link_id,
"name": thing.name, "name": thing.name,
@ -69,19 +71,11 @@ def serialize(thing):
} }
else: else:
return { return {
"archived": thing.archived,
"author": str(thing.author), "author": str(thing.author),
"author_flair_text": thing.author_flair_text, "author_flair_text": thing.author_flair_text,
"category": thing.category,
"comment_limit": thing.comment_limit,
"created": int(thing.created), "created": int(thing.created),
"created_utc": int(thing.created_utc), "created_utc": int(thing.created_utc),
"discussion_type": thing.discussion_type,
"domain": thing.domain, "domain": thing.domain,
"downs": thing.downs,
"edited": int(thing.edited),
"gilded": thing.gilded,
"gildings": thing.gildings,
"id": thing.id, "id": thing.id,
"is_crosspostable": thing.is_crosspostable, "is_crosspostable": thing.is_crosspostable,
"is_meta": thing.is_meta, "is_meta": thing.is_meta,
@ -90,37 +84,26 @@ def serialize(thing):
"is_robot_indexable": thing.is_robot_indexable, "is_robot_indexable": thing.is_robot_indexable,
"is_self": thing.is_self, "is_self": thing.is_self,
"is_video": thing.is_video, "is_video": thing.is_video,
"likes": thing.likes, "link_flair_text": thing.link_flair_text if hasattr(thing, "link_flair_text") else None,
"link_flair_text": thing.link_flair_text,
"locked": thing.locked, "locked": thing.locked,
"media": thing.media,
"media_embed": thing.media_embed,
"media_only": thing.media_only,
"name": thing.name,
"num_comments": thing.num_comments, "num_comments": thing.num_comments,
"num_crossposts": thing.num_crossposts, "num_crossposts": thing.num_crossposts,
"num_reports": thing.num_reports,
"over_18": thing.over_18, "over_18": thing.over_18,
"parent_whitelist_status": thing.parent_whitelist_status, "parent_whitelist_status": thing.parent_whitelist_status,
"permalink": thing.permalink, "permalink": thing.permalink,
"pinned": thing.pinned, "pinned": thing.pinned,
"quarantine": thing.quarantine,
"score": thing.score, "score": thing.score,
"secure_media": thing.secure_media,
"secure_media_embed": thing.secure_media_embed,
"selftext": thing.selftext, "selftext": thing.selftext,
"selftext_html": thing.selftext_html, "selftext_html": thing.selftext_html if hasattr(thing, "selftext_html") else None,
"spoiler": thing.spoiler, "spoiler": thing.spoiler,
"stickied": thing.stickied, "stickied": thing.stickied,
"subreddit": str(thing.subreddit), "subreddit": str(thing.subreddit),
"subreddit_id": thing.subreddit_id, "subreddit_id": thing.subreddit_id,
"subreddit_subscribers": thing.subreddit_subscribers, "subreddit_subscribers": thing.subreddit_subscribers,
"subreddit_type": thing.subreddit_type, "subreddit_type": thing.subreddit_type,
"thumbnail": thing.thumbnail,
"thumbnail_height": thing.thumbnail_height,
"thumbnail_width": thing.thumbnail_width,
"title": thing.title, "title": thing.title,
"ups": thing.ups, "ups": thing.ups if hasattr(thing, "ups") else 0,
"downs": thing.downs if hasattr(thing, "downs") else 0,
"url": thing.url, "url": thing.url,
} }
@ -216,10 +199,6 @@ if __name__ == "__main__":
monitoring.init() monitoring.init()
pub_queue = Queue() pub_queue = Queue()
rabbit = pika.BlockingConnection(pika.ConnectionParameters(host=sys.argv[1]))
reddit_channel = rabbit.channel()
reddit_channel.exchange_declare(exchange="reddit", exchange_type="topic")
while True: while True:
try: try:
publish_thread = threading.Thread(target=publish_worker, args=(pub_queue,)) publish_thread = threading.Thread(target=publish_worker, args=(pub_queue,))