1
0
mirror of https://github.com/simon987/poal_feed.git synced 2025-04-09 05:36:46 +00:00

Fix when crawler reaches realtime post IDs

This commit is contained in:
simon987 2021-06-16 16:48:51 -04:00
parent 888c60c269
commit 3067ea2b96

15
poal.py

@ -67,6 +67,8 @@ class PoalScanner:
yield comment yield comment
def all_items(self): def all_items(self):
not_found_in_a_row = 0
for pid in range(1, 500_000): for pid in range(1, 500_000):
if self._state.has_visited(pid): if self._state.has_visited(pid):
continue continue
@ -74,8 +76,17 @@ class PoalScanner:
r = self._web.get(url) r = self._web.get(url)
if r.status_code == 404: if r.status_code == 404:
# Assume that we reached the end (?) for now not_found_in_a_row += 1
return
if not_found_in_a_row > 10:
break
if self._state.has_visited(pid + 1):
self._state.mark_visited(pid)
continue
not_found_in_a_row = 0
if r.status_code == 406: if r.status_code == 406:
# " This sub is disabled You're not allowed to see this stuff" # " This sub is disabled You're not allowed to see this stuff"