Fix postref, add approx date

This commit is contained in:
simon987 2020-12-31 08:49:02 -05:00
parent a6ede3814b
commit 11901b7d6c
3 changed files with 33 additions and 20 deletions

View File

@ -35,11 +35,12 @@ class ParlerProfile(ParlerItem):
class ParlerFollower(ParlerItem): class ParlerFollower(ParlerItem):
def __init__(self, user_id, follower_id): def __init__(self, user_id, follower_id, approx_date):
super().__init__() super().__init__()
self.data = { self.data = {
"user_id": user_id, "user_id": user_id,
"follower_id": follower_id, "follower_id": follower_id,
"approx_date": approx_date,
} }
def item_type(self): def item_type(self):
@ -51,11 +52,12 @@ class ParlerFollower(ParlerItem):
class ParlerFollowee(ParlerItem): class ParlerFollowee(ParlerItem):
def __init__(self, user_id, followee_id): def __init__(self, user_id, followee_id, approx_date):
super().__init__() super().__init__()
self.data = { self.data = {
"user_id": user_id, "user_id": user_id,
"followee_id": followee_id, "followee_id": followee_id,
"approx_date": approx_date
} }
def item_type(self): def item_type(self):
@ -80,15 +82,19 @@ class ParlerPost(ParlerItem):
class ParlerPostRef(ParlerItem):
    """Item recording that a post id was seen while scanning a user's feed.

    Only identifiers are stored; the post body itself is not part of
    this item.
    """

    def __init__(self, post_id, user_id, approx_date):
        """Build the reference.

        :param post_id: id of the referenced post.
        :param user_id: id of the user whose feed contained the reference.
        :param approx_date: value stored as-is under ``"approx_date"``.
            NOTE(review): the scanner appears to pass the pagination key
            of the page the item was found on -- confirm against caller.
        """
        super().__init__()
        self.data = {
            "post_id": post_id,
            "user_id": user_id,
            "approx_date": approx_date,
        }

    def item_type(self):
        """Return the type tag of this item, ``"postref"``."""
        return "postref"

    def item_id(self):
        """Return ``user_id`` followed by ``post_id``.

        Combining both ids keeps references to the same post from
        different users' feeds distinct.
        """
        return self.data["user_id"] + self.data["post_id"]
class ParlerUrl(ParlerItem): class ParlerUrl(ParlerItem):

View File

@ -29,24 +29,30 @@ if not MST:
class SessionDebugWrapper(requests.Session):
    """``requests.Session`` whose ``get`` logs each request and retries failures."""

    def get(self, url, **kwargs):
        """Perform a GET request, retrying up to four times.

        HTTP 429 and 502 responses are treated as failures and retried,
        as is any exception raised while performing the request.

        :param url: URL to fetch.
        :param kwargs: forwarded to ``requests.Session.get``. A 45 second
            ``timeout`` is applied unless the caller provides one.
        :return: the first response whose status is neither 429 nor 502.
        :raises Exception: when every attempt failed.
        """
        # Passing ``timeout`` both through **kwargs and as a literal keyword
        # raises TypeError, which the retry loop would swallow and report as
        # a network failure; make it an overridable default instead.
        kwargs.setdefault("timeout", 45)

        # Build the URL shown in log lines once. ``params`` may be absent or
        # None, and may be a type ``urlencode`` rejects (e.g. a plain string).
        params = kwargs.get("params")
        try:
            query = urlencode(params) if params else ""
        except TypeError:
            query = str(params)
        full_url = url + "?" + query

        retries = 4
        while retries > 0:
            retries -= 1
            try:
                r = super().get(url, **kwargs)
                logger.debug("GET %s <%d>" % (full_url, r.status_code))

                if r.status_code == 429:
                    # Rate limited: back off before the generic retry delay.
                    sleep(15)
                    raise Exception("rate limited")
                if r.status_code == 502:
                    raise Exception("Server error")
                return r
            except KeyboardInterrupt:
                # Never retry on a user interrupt.
                raise
            except Exception as e:
                logger.warning("%s: %s" % (full_url, str(e)))
                sleep(10)

        raise Exception("Gave up request after maximum number of retries")
@ -112,7 +118,7 @@ class ParlerScanner:
for items_key in items_keys: for items_key in items_keys:
if items_key in j and j[items_key]: if items_key in j and j[items_key]:
for item in j[items_key]: for item in j[items_key]:
yield item, items_key yield item, items_key, current_key
self._state.set_resume_key(resume_endpoint, resume_id, current_key) self._state.set_resume_key(resume_endpoint, resume_id, current_key)
@ -122,27 +128,27 @@ class ParlerScanner:
current_key = j["next"] current_key = j["next"]
def user_followers(self, api, user_id):
    """Yield a ``ParlerFollower`` for every follower of ``user_id``.

    :param api: client object; ``api.user_api.get_followers_for_user_id``
        is the paginated call handed to ``_iterate_endpoint``.
    :param user_id: id of the user whose followers are listed.
    """
    # _iterate_endpoint yields (item, items_key, current_key); the items key
    # is always "followers" here, so it is ignored. The pagination key is
    # stored on the item as its approximate date.
    for profile, _, it_index in self._iterate_endpoint(
        func=api.user_api.get_followers_for_user_id,
        params={"id": user_id},
        resume_endpoint="followers",
        resume_id=user_id,
        items_keys=["followers"],
    ):
        yield ParlerFollower(user_id=user_id, follower_id=profile["id"], approx_date=it_index)
def user_followees(self, api, user_id):
    """Yield a ``ParlerFollowee`` for every account ``user_id`` follows.

    :param api: client object; ``api.user_api.get_following_for_user_id``
        is the paginated call handed to ``_iterate_endpoint``.
    :param user_id: id of the user whose followees are listed.
    """
    # _iterate_endpoint yields (item, items_key, current_key); the items key
    # is always "followees" here, so it is ignored. The pagination key is
    # stored on the item as its approximate date.
    for profile, _, it_index in self._iterate_endpoint(
        func=api.user_api.get_following_for_user_id,
        params={"id": user_id},
        resume_endpoint="followees",
        resume_id=user_id,
        items_keys=["followees"],
    ):
        yield ParlerFollowee(user_id=user_id, followee_id=profile["id"], approx_date=it_index)
def user_posts(self, api, user_id): def user_posts(self, api, user_id):
for item, key in self._iterate_endpoint( for item, key, it_index in self._iterate_endpoint(
func=api.feed_api.get_users_feed, func=api.feed_api.get_users_feed,
params={"id": user_id}, params={"id": user_id},
resume_endpoint="posts", resume_endpoint="posts",
@ -153,12 +159,12 @@ class ParlerScanner:
if key == "posts": if key == "posts":
yield ParlerPost(data=item) yield ParlerPost(data=item)
elif key == "postRefs": elif key == "postRefs":
yield ParlerPostRef(data=item) yield ParlerPostRef(post_id=item["_id"], user_id=user_id, approx_date=it_index)
elif key == "urls": elif key == "urls":
yield ParlerUrl(data=item) yield ParlerUrl(data=item)
def post_comments(self, api, post_id): def post_comments(self, api, post_id):
for item, key in self._iterate_endpoint( for item, key, _ in self._iterate_endpoint(
func=api.comments_api.get_comments, func=api.comments_api.get_comments,
params={"id": post_id, "reverse": "true"}, params={"id": post_id, "reverse": "true"},
resume_endpoint="comments", resume_endpoint="comments",
@ -191,6 +197,8 @@ class ParlerScanner:
user_id = self._get_user_id_hash(api, int_id) user_id = self._get_user_id_hash(api, int_id)
if user_id: if user_id:
yield user_id, int_id yield user_id, int_id
else:
self._state.mark_visited_user(int_id)
def process_userid(self, api, user_id, int_id): def process_userid(self, api, user_id, int_id):
profile = self.fetch_profile(api, user_id, int_id) profile = self.fetch_profile(api, user_id, int_id)
@ -205,8 +213,7 @@ class ParlerScanner:
for post in self.user_posts(api, user_id): for post in self.user_posts(api, user_id):
yield post yield post
if (post.item_type() == "post" or post.item_type() == "postref") \ if post.item_type() == "post" and not self._state.has_visited_post(post):
and not self._state.has_visited_post(post):
for comment in self.post_comments(api, post.item_id()): for comment in self.post_comments(api, post.item_id()):
yield comment yield comment
self._state.mark_visited_post(post.item_id()) self._state.mark_visited_post(post.item_id())

View File

@ -3,7 +3,7 @@ import sys
from logging import StreamHandler from logging import StreamHandler
logger = logging.getLogger("default") logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG) logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s <%(threadName)s> %(levelname)-5s %(message)s') formatter = logging.Formatter('%(asctime)s <%(threadName)s> %(levelname)-5s %(message)s')
for h in logger.handlers: for h in logger.handlers: