mirror of
				https://github.com/simon987/poal_feed.git
				synced 2025-11-04 02:46:55 +00:00 
			
		
		
		
	Fix crawler behavior when it reaches real-time post IDs
This commit is contained in:
		
							parent
							
								
									888c60c269
								
							
						
					
					
						commit
						3067ea2b96
					
				
							
								
								
									
										15
									
								
								poal.py
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								poal.py
									
									
									
									
									
								
							@ -67,6 +67,8 @@ class PoalScanner:
 | 
				
			|||||||
            yield comment
 | 
					            yield comment
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def all_items(self):
 | 
					    def all_items(self):
 | 
				
			||||||
 | 
					        not_found_in_a_row = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for pid in range(1, 500_000):
 | 
					        for pid in range(1, 500_000):
 | 
				
			||||||
            if self._state.has_visited(pid):
 | 
					            if self._state.has_visited(pid):
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
@ -74,8 +76,17 @@ class PoalScanner:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            r = self._web.get(url)
 | 
					            r = self._web.get(url)
 | 
				
			||||||
            if r.status_code == 404:
 | 
					            if r.status_code == 404:
 | 
				
			||||||
                # Assume that we reached the end (?) for now
 | 
					                not_found_in_a_row += 1
 | 
				
			||||||
                return
 | 
					
 | 
				
			||||||
 | 
					                if not_found_in_a_row > 10:
 | 
				
			||||||
 | 
					                    break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                if self._state.has_visited(pid + 1):
 | 
				
			||||||
 | 
					                    self._state.mark_visited(pid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            not_found_in_a_row = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if r.status_code == 406:
 | 
					            if r.status_code == 406:
 | 
				
			||||||
                # " This sub is disabled You're not allowed to see this stuff"
 | 
					                # " This sub is disabled You're not allowed to see this stuff"
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user