mirror of
https://github.com/simon987/poal_feed.git
synced 2025-04-09 05:36:46 +00:00
Fix crawler behavior when it reaches real-time post IDs
This commit is contained in:
parent
888c60c269
commit
3067ea2b96
15
poal.py
15
poal.py
@ -67,6 +67,8 @@ class PoalScanner:
|
|||||||
yield comment
|
yield comment
|
||||||
|
|
||||||
def all_items(self):
|
def all_items(self):
|
||||||
|
not_found_in_a_row = 0
|
||||||
|
|
||||||
for pid in range(1, 500_000):
|
for pid in range(1, 500_000):
|
||||||
if self._state.has_visited(pid):
|
if self._state.has_visited(pid):
|
||||||
continue
|
continue
|
||||||
@ -74,8 +76,17 @@ class PoalScanner:
|
|||||||
|
|
||||||
r = self._web.get(url)
|
r = self._web.get(url)
|
||||||
if r.status_code == 404:
|
if r.status_code == 404:
|
||||||
# Assume that we reached the end (?) for now
|
not_found_in_a_row += 1
|
||||||
return
|
|
||||||
|
if not_found_in_a_row > 10:
|
||||||
|
break
|
||||||
|
|
||||||
|
if self._state.has_visited(pid + 1):
|
||||||
|
self._state.mark_visited(pid)
|
||||||
|
|
||||||
|
continue
|
||||||
|
|
||||||
|
not_found_in_a_row = 0
|
||||||
|
|
||||||
if r.status_code == 406:
|
if r.status_code == 406:
|
||||||
# " This sub is disabled You're not allowed to see this stuff"
|
# " This sub is disabled You're not allowed to see this stuff"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user