1
0
mirror of https://github.com/simon987/poal_feed.git synced 2025-04-09 13:46:46 +00:00

Update scraper with new version of hexlib

This commit is contained in:
simon987 2021-03-07 11:08:52 -05:00
parent 69b6902fef
commit d2bfcea44d
6 changed files with 24 additions and 104 deletions

@ -5,6 +5,7 @@ services:
image: simon987/poal_feed
restart: always
environment:
- "PF_REDIS_HOST="
- "PF_RPS=1"
- "REDIS_HOST="
- "RPS=1"
- "USER_AGENT=firefox"
- "PF_MAX_PAGE=300"

@ -3,17 +3,18 @@ import os
from json import JSONDecodeError
from urllib.parse import urljoin
from hexlib.env import get_web
from hexlib.log import logger
from post_process import get_links_from_body
from state import PoalState
from util import Web, logger
PF_MAX_PAGE = int(os.environ.get("PF_MAX_PAGE", 9999999))
class PoalHelper:
def __init__(self, url, rps, boards):
self.rps = rps
def __init__(self, url, boards):
self._boards = boards
self._url = url
@ -107,7 +108,7 @@ class PoalScanner:
def __init__(self, state: PoalState, helper: PoalHelper):
self._state = state
self._helper = helper
self._web = Web(rps=helper.rps)
self._web = get_web()
def _posts(self, board):
r = self._web.get(self._helper.posts_url(board))

@ -1,4 +1,4 @@
requests
urllib3
git+git://github.com/simon987/hexlib.git
redis
git+git://github.com/simon987/hexlib.git

46
run.py

@ -1,36 +1,18 @@
import json
import os
import traceback
from queue import Queue
from threading import Thread
import redis
from hexlib.concurrency import queue_iter
from hexlib.env import get_redis
from poal import PoalScanner, PoalHelper
from post_process import post_process
from state import PoalState
from util import logger
REDIS_HOST = os.environ.get("PF_REDIS_HOST", "localhost")
REDIS_PORT = os.environ.get("PF_REDIS_PORT", 6379)
PF_PUBLISH = os.environ.get("PF_PUBLISH", False)
PF_RPS = os.environ.get("PF_RPS", 1)
ARC_LISTS = os.environ.get("PF_ARC_LISTS", "arc").split(",")
def publish_worker(queue: Queue, helper):
while True:
try:
item, board = queue.get()
if item is None:
break
publish(item, board, helper)
except Exception as e:
logger.error(str(e) + ": " + traceback.format_exc())
finally:
queue.task_done()
for item, board in queue_iter(queue):
publish(item, board, helper)
def once(func):
@ -50,10 +32,7 @@ def publish(item, board, helper):
routing_key = "%s.%s" % (item_type, board)
message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
if PF_PUBLISH:
rdb.publish("poal." + routing_key, message)
for arc in ARC_LISTS:
rdb.lpush(arc + ".poal." + routing_key, message)
rdb.lpush("arc.poal." + routing_key, message)
HELPER = PoalHelper(
@ -61,20 +40,18 @@ HELPER = PoalHelper(
"all",
# TODO: Are there hidden boards that do not show up in /all ?
),
rps=PF_RPS,
url="https://poal.co"
)
if __name__ == "__main__":
state = PoalState("poal", REDIS_HOST, REDIS_PORT)
rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
state = PoalState("poal")
rdb = get_redis()
publish_q = Queue()
for _ in range(3):
publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER))
publish_thread.setDaemon(True)
publish_thread.start()
publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER))
publish_thread.setDaemon(True)
publish_thread.start()
s = PoalScanner(state, HELPER)
while True:
@ -83,6 +60,5 @@ if __name__ == "__main__":
publish_q.put((item, board))
except KeyboardInterrupt as e:
print("cleanup..")
for _ in range(3):
publish_q.put((None, None))
publish_q.put((None, None))
break

@ -2,10 +2,10 @@ from hexlib.db import VolatileState, VolatileBooleanState
class PoalState:
def __init__(self, prefix, host, port):
self._posts = VolatileState(prefix, host=host, port=port)
self._comments = VolatileBooleanState(prefix, host=host, port=port)
self._users = VolatileBooleanState(prefix, host=host, port=port)
def __init__(self, prefix):
self._posts = VolatileState(prefix)
self._comments = VolatileBooleanState(prefix)
self._users = VolatileBooleanState(prefix)
def has_visited(self, item_id):
return self._comments["comments"][item_id]

58
util.py

@ -1,58 +0,0 @@
import logging
import sys
import traceback
from logging import StreamHandler
import requests
from hexlib.misc import rate_limit
logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
for h in logger.handlers:
logger.removeHandler(h)
logger.addHandler(StreamHandler(sys.stdout))
UA = "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:83.0) Gecko/20100101 Firefox/83.0"
class Web:
def __init__(self, rps):
self.session = requests.Session()
self._rps = rps
@rate_limit(self._rps)
def _get(url, **kwargs):
if "headers" in kwargs:
kwargs["headers"]["User-Agent"] = UA
else:
kwargs["headers"] = {"User-Agent": UA}
retries = 8
while retries > 0:
retries -= 1
try:
r = self.session.get(url, **kwargs)
if r.status_code == 500:
raise Exception("Server error")
return r
except KeyboardInterrupt as e:
raise e
except Exception as e:
logger.warning("Error with request %s: %s" % (url, str(e)))
raise Exception("Gave up request after maximum number of retries")
self._get = _get
def get(self, url, **kwargs):
try:
r = self._get(url, **kwargs)
logger.debug("GET %s <%d>" % (url, r.status_code))
return r
except KeyboardInterrupt as e:
raise e
except Exception as e:
logger.error(str(e) + traceback.format_exc())
return None