mirror of
https://github.com/simon987/poal_feed.git
synced 2025-04-17 17:36:42 +00:00
Update scraper with new version of hexlib
This commit is contained in:
parent
69b6902fef
commit
d2bfcea44d
@ -5,6 +5,7 @@ services:
|
|||||||
image: simon987/poal_feed
|
image: simon987/poal_feed
|
||||||
restart: always
|
restart: always
|
||||||
environment:
|
environment:
|
||||||
- "PF_REDIS_HOST="
|
- "REDIS_HOST="
|
||||||
- "PF_RPS=1"
|
- "RPS=1"
|
||||||
|
- "USER_AGENT=firefox"
|
||||||
- "PF_MAX_PAGE=300"
|
- "PF_MAX_PAGE=300"
|
||||||
|
9
poal.py
9
poal.py
@ -3,17 +3,18 @@ import os
|
|||||||
from json import JSONDecodeError
|
from json import JSONDecodeError
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from hexlib.env import get_web
|
||||||
|
from hexlib.log import logger
|
||||||
|
|
||||||
from post_process import get_links_from_body
|
from post_process import get_links_from_body
|
||||||
from state import PoalState
|
from state import PoalState
|
||||||
from util import Web, logger
|
|
||||||
|
|
||||||
PF_MAX_PAGE = int(os.environ.get("PF_MAX_PAGE", 9999999))
|
PF_MAX_PAGE = int(os.environ.get("PF_MAX_PAGE", 9999999))
|
||||||
|
|
||||||
|
|
||||||
class PoalHelper:
|
class PoalHelper:
|
||||||
|
|
||||||
def __init__(self, url, rps, boards):
|
def __init__(self, url, boards):
|
||||||
self.rps = rps
|
|
||||||
self._boards = boards
|
self._boards = boards
|
||||||
self._url = url
|
self._url = url
|
||||||
|
|
||||||
@ -107,7 +108,7 @@ class PoalScanner:
|
|||||||
def __init__(self, state: PoalState, helper: PoalHelper):
|
def __init__(self, state: PoalState, helper: PoalHelper):
|
||||||
self._state = state
|
self._state = state
|
||||||
self._helper = helper
|
self._helper = helper
|
||||||
self._web = Web(rps=helper.rps)
|
self._web = get_web()
|
||||||
|
|
||||||
def _posts(self, board):
|
def _posts(self, board):
|
||||||
r = self._web.get(self._helper.posts_url(board))
|
r = self._web.get(self._helper.posts_url(board))
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
requests
|
requests
|
||||||
urllib3
|
urllib3
|
||||||
git+git://github.com/simon987/hexlib.git
|
|
||||||
redis
|
redis
|
||||||
|
git+git://github.com/simon987/hexlib.git
|
||||||
|
46
run.py
46
run.py
@ -1,36 +1,18 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import traceback
|
|
||||||
from queue import Queue
|
from queue import Queue
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
|
|
||||||
import redis
|
from hexlib.concurrency import queue_iter
|
||||||
|
from hexlib.env import get_redis
|
||||||
|
|
||||||
from poal import PoalScanner, PoalHelper
|
from poal import PoalScanner, PoalHelper
|
||||||
from post_process import post_process
|
from post_process import post_process
|
||||||
from state import PoalState
|
from state import PoalState
|
||||||
from util import logger
|
|
||||||
|
|
||||||
REDIS_HOST = os.environ.get("PF_REDIS_HOST", "localhost")
|
|
||||||
REDIS_PORT = os.environ.get("PF_REDIS_PORT", 6379)
|
|
||||||
PF_PUBLISH = os.environ.get("PF_PUBLISH", False)
|
|
||||||
PF_RPS = os.environ.get("PF_RPS", 1)
|
|
||||||
|
|
||||||
ARC_LISTS = os.environ.get("PF_ARC_LISTS", "arc").split(",")
|
|
||||||
|
|
||||||
|
|
||||||
def publish_worker(queue: Queue, helper):
|
def publish_worker(queue: Queue, helper):
|
||||||
while True:
|
for item, board in queue_iter(queue):
|
||||||
try:
|
publish(item, board, helper)
|
||||||
item, board = queue.get()
|
|
||||||
if item is None:
|
|
||||||
break
|
|
||||||
publish(item, board, helper)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(str(e) + ": " + traceback.format_exc())
|
|
||||||
finally:
|
|
||||||
queue.task_done()
|
|
||||||
|
|
||||||
|
|
||||||
def once(func):
|
def once(func):
|
||||||
@ -50,10 +32,7 @@ def publish(item, board, helper):
|
|||||||
routing_key = "%s.%s" % (item_type, board)
|
routing_key = "%s.%s" % (item_type, board)
|
||||||
|
|
||||||
message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
|
message = json.dumps(item, separators=(',', ':'), ensure_ascii=False, sort_keys=True)
|
||||||
if PF_PUBLISH:
|
rdb.lpush("arc.poal." + routing_key, message)
|
||||||
rdb.publish("poal." + routing_key, message)
|
|
||||||
for arc in ARC_LISTS:
|
|
||||||
rdb.lpush(arc + ".poal." + routing_key, message)
|
|
||||||
|
|
||||||
|
|
||||||
HELPER = PoalHelper(
|
HELPER = PoalHelper(
|
||||||
@ -61,20 +40,18 @@ HELPER = PoalHelper(
|
|||||||
"all",
|
"all",
|
||||||
# TODO: Are there hidden boards that do not show up in /all ?
|
# TODO: Are there hidden boards that do not show up in /all ?
|
||||||
),
|
),
|
||||||
rps=PF_RPS,
|
|
||||||
url="https://poal.co"
|
url="https://poal.co"
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
state = PoalState("poal", REDIS_HOST, REDIS_PORT)
|
state = PoalState("poal")
|
||||||
rdb = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
|
rdb = get_redis()
|
||||||
|
|
||||||
publish_q = Queue()
|
publish_q = Queue()
|
||||||
for _ in range(3):
|
publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER))
|
||||||
publish_thread = Thread(target=publish_worker, args=(publish_q, HELPER))
|
publish_thread.setDaemon(True)
|
||||||
publish_thread.setDaemon(True)
|
publish_thread.start()
|
||||||
publish_thread.start()
|
|
||||||
|
|
||||||
s = PoalScanner(state, HELPER)
|
s = PoalScanner(state, HELPER)
|
||||||
while True:
|
while True:
|
||||||
@ -83,6 +60,5 @@ if __name__ == "__main__":
|
|||||||
publish_q.put((item, board))
|
publish_q.put((item, board))
|
||||||
except KeyboardInterrupt as e:
|
except KeyboardInterrupt as e:
|
||||||
print("cleanup..")
|
print("cleanup..")
|
||||||
for _ in range(3):
|
publish_q.put((None, None))
|
||||||
publish_q.put((None, None))
|
|
||||||
break
|
break
|
||||||
|
8
state.py
8
state.py
@ -2,10 +2,10 @@ from hexlib.db import VolatileState, VolatileBooleanState
|
|||||||
|
|
||||||
|
|
||||||
class PoalState:
|
class PoalState:
|
||||||
def __init__(self, prefix, host, port):
|
def __init__(self, prefix):
|
||||||
self._posts = VolatileState(prefix, host=host, port=port)
|
self._posts = VolatileState(prefix)
|
||||||
self._comments = VolatileBooleanState(prefix, host=host, port=port)
|
self._comments = VolatileBooleanState(prefix)
|
||||||
self._users = VolatileBooleanState(prefix, host=host, port=port)
|
self._users = VolatileBooleanState(prefix)
|
||||||
|
|
||||||
def has_visited(self, item_id):
|
def has_visited(self, item_id):
|
||||||
return self._comments["comments"][item_id]
|
return self._comments["comments"][item_id]
|
||||||
|
58
util.py
58
util.py
@ -1,58 +0,0 @@
|
|||||||
import logging
|
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
from logging import StreamHandler
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from hexlib.misc import rate_limit
|
|
||||||
|
|
||||||
logger = logging.getLogger("default")
|
|
||||||
logger.setLevel(logging.DEBUG)
|
|
||||||
|
|
||||||
formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
|
|
||||||
for h in logger.handlers:
|
|
||||||
logger.removeHandler(h)
|
|
||||||
logger.addHandler(StreamHandler(sys.stdout))
|
|
||||||
|
|
||||||
UA = "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:83.0) Gecko/20100101 Firefox/83.0"
|
|
||||||
|
|
||||||
|
|
||||||
class Web:
|
|
||||||
def __init__(self, rps):
|
|
||||||
self.session = requests.Session()
|
|
||||||
self._rps = rps
|
|
||||||
|
|
||||||
@rate_limit(self._rps)
|
|
||||||
def _get(url, **kwargs):
|
|
||||||
if "headers" in kwargs:
|
|
||||||
kwargs["headers"]["User-Agent"] = UA
|
|
||||||
else:
|
|
||||||
kwargs["headers"] = {"User-Agent": UA}
|
|
||||||
retries = 8
|
|
||||||
|
|
||||||
while retries > 0:
|
|
||||||
retries -= 1
|
|
||||||
try:
|
|
||||||
r = self.session.get(url, **kwargs)
|
|
||||||
if r.status_code == 500:
|
|
||||||
raise Exception("Server error")
|
|
||||||
return r
|
|
||||||
except KeyboardInterrupt as e:
|
|
||||||
raise e
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Error with request %s: %s" % (url, str(e)))
|
|
||||||
raise Exception("Gave up request after maximum number of retries")
|
|
||||||
|
|
||||||
self._get = _get
|
|
||||||
|
|
||||||
def get(self, url, **kwargs):
|
|
||||||
try:
|
|
||||||
r = self._get(url, **kwargs)
|
|
||||||
|
|
||||||
logger.debug("GET %s <%d>" % (url, r.status_code))
|
|
||||||
return r
|
|
||||||
except KeyboardInterrupt as e:
|
|
||||||
raise e
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(str(e) + traceback.format_exc())
|
|
||||||
return None
|
|
Loading…
x
Reference in New Issue
Block a user