From 5eccee69cf019882c1826843de977f7e4628ab13 Mon Sep 17 00:00:00 2001 From: simon987 Date: Thu, 19 Mar 2020 13:47:21 -0400 Subject: [PATCH] docker tweaks --- Dockerfile | 11 +- chan/helper.py | 1 + docker-compose.yml | 411 ++++++++++++++++++++++++++++++++++++++++++++- post_process.py | 19 ++- run.py | 4 + util.py | 3 - 6 files changed, 434 insertions(+), 15 deletions(-) diff --git a/Dockerfile b/Dockerfile index d97f1f1..bf21a04 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,11 @@ FROM python:3.8 -WORKDIR /app - -ADD requirements.txt /app/requirements.txt +ADD requirements.txt /requirements.txt RUN pip install -r requirements.txt -ENTRYPOINT ["python", "run.py"] - COPY . /app + +RUN chmod 777 -R /app + +WORKDIR /app +ENTRYPOINT ["python", "run.py"] diff --git a/chan/helper.py b/chan/helper.py index 32274bf..9ff3770 100644 --- a/chan/helper.py +++ b/chan/helper.py @@ -11,6 +11,7 @@ class ChanHelper: self._boards = boards self.rps = rps self.get_method = None + self.save_folder = None def boards(self): return [b.replace("\\_", "_") for b in self._boards if not b.startswith("_")] diff --git a/docker-compose.yml b/docker-compose.yml index 3f97489..ee24e2b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,9 +1,13 @@ version: "2.1" +volumes: + influxdb_data: + pg_data: + services: influxdb: image: influxdb:alpine volumes: - - ./influxdb_data:/var/lib/influxdb + - influxdb_data:/var/lib/influxdb grafana: image: grafana/grafana ports: @@ -13,7 +17,7 @@ services: db: image: postgres volumes: - - ./pg_data:/var/lib/postgresql/data + - pg_data:/var/lib/postgresql/data environment: - "POSTGRES_USER=feed_archiver" - "POSTGRES_PASSWORD=changeme" @@ -44,11 +48,412 @@ services: build: ./docker_viz/ ports: - 127.0.0.1:3005:80 - chan_4chan: + # Image boards + 0chan: image: simon987/chan_feed restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=0chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 22chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=22chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 2chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=2chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + + 2chhk: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=2chhk" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 38chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=38chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 410chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=410chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 4chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ environment: - "CF_CHAN=4chan" - "CF_MQ_HOST=rabbitmq" - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + 4kev: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=4kev" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 7chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=7chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + 8kun: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=8kun" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + alokal: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=alokal" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + aurorachan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=aurorachan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + awsumchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=awsumchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + chanon: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=chanon" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + chanorg: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=chanorg" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + desuchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=desuchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + doushio: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=doushio" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + endchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=endchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + fchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=fchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + gnfos: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=gnfos" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + hispachan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=hispachan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + horochan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=horochan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + iichan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=iichan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + lainchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=lainchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + lolnada: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=lolnada" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + nowere: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=nowere" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + plus4chan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=plus4chan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + sushigirl: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=sushigirl" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + synch: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=synch" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + tahta: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=tahta" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + tgchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=tgchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + uboachan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=uboachan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + waifuist: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=waifuist" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" + + wizchan: + image: simon987/chan_feed + restart: always + user: ${CURRENT_UID} + volumes: + - ${SAVE_FOLDER}:/data/ + environment: + - "CF_CHAN=wizchan" + - "CF_MQ_HOST=rabbitmq" + - "CF_INFLUXDB=influxdb" + - "CF_SAVE_FOLDER=/data/" diff --git a/post_process.py b/post_process.py index a00f316..b2a7f6c 100644 --- a/post_process.py +++ b/post_process.py @@ -1,7 +1,8 @@ import hashlib +import os import zlib from io import BytesIO -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse import imagehash from PIL import Image @@ -31,13 +32,23 @@ def _is_image(url): return url.lower().endswith(IMAGE_FILETYPES) -def image_meta(url, url_idx, web): +def image_meta(url, url_idx, web, helper, board): r = web.get(url) if not r: logger.warning("Could not download image") return None buf = r.content + sha1 = hashlib.sha1(buf).hexdigest() + + if helper.save_folder: + path = os.path.join(helper.save_folder, str(helper.db_id), board) + path += "/" + sha1[0] + path += "/" + sha1[1:3] + os.makedirs(path, exist_ok=True) + with open(os.path.join(path, sha1 + os.path.splitext(url)[1]), "wb") as out: + out.write(buf) + try: f = BytesIO(buf) im = Image.open(f) @@ -47,7 +58,7 @@ def image_meta(url, url_idx, web): "size": len(buf), "width": im.width, "height": im.height, - "sha1": hashlib.sha1(buf).hexdigest(), + "sha1": sha1, "md5": hashlib.md5(buf).hexdigest(), "crc32": format(zlib.crc32(buf), "x"), "dhash": b64hash(imagehash.dhash(im, hash_size=12), 18), @@ -73,7 +84,7 @@ def post_process(item, board, helper, web): item["_urls"] = helper.item_urls(item, board) - item["_img"] = [image_meta(url, i, web) for i, url in enumerate(item["_urls"]) if _is_image(url)] + item["_img"] = [image_meta(url, i, web, helper, board) for i, url in enumerate(item["_urls"]) if _is_image(url)] return item diff --git a/run.py b/run.py index 399eabe..1563757 100644 --- a/run.py +++ b/run.py @@ -233,6 +233,10 @@ if __name__ == "__main__": rabbitmq_host = os.environ.get("CF_MQ_HOST", "localhost") chan = os.environ.get("CF_CHAN", None) chan_helper = CHANS[chan] + save_folder = os.environ.get("CF_SAVE_FOLDER", "") + + if save_folder: + chan_helper.save_folder = save_folder proxy = None if os.environ.get("CF_PROXY"): diff --git a/util.py b/util.py index 91af7c4..ab93082 100644 --- a/util.py +++ b/util.py @@ -16,11 +16,8 @@ logger = logging.getLogger("default") logger.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s') -file_handler = FileHandler("chan_feed.log") -file_handler.setFormatter(formatter) for h in logger.handlers: logger.removeHandler(h) -logger.addHandler(file_handler) logger.addHandler(StreamHandler(sys.stdout))