From 3067ea2b96568e78e952e77ca4010cb3f1aeb5e0 Mon Sep 17 00:00:00 2001 From: simon987 Date: Wed, 16 Jun 2021 16:48:51 -0400 Subject: [PATCH] Fix when crawler reaches realtime posts IDs --- poal.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/poal.py b/poal.py index e18cab8..c1ecf9e 100644 --- a/poal.py +++ b/poal.py @@ -67,6 +67,8 @@ class PoalScanner: yield comment def all_items(self): + not_found_in_a_row = 0 + for pid in range(1, 500_000): if self._state.has_visited(pid): continue @@ -74,8 +76,17 @@ class PoalScanner: r = self._web.get(url) if r.status_code == 404: - # Assume that we reached the end (?) for now - return + not_found_in_a_row += 1 + + if not_found_in_a_row > 10: + break + + if self._state.has_visited(pid + 1): + self._state.mark_visited(pid) + + continue + + not_found_in_a_row = 0 if r.status_code == 406: # " This sub is disabled You're not allowed to see this stuff"